Bump SHARK-TestSuite to include central spec. (#17283)
ci-exactly: build_packages, regression_test_cpu,
regression_test_amdgpu_vulkan, regression_test_amdgpu_rocm,
regression_test_nvidiagpu_vulkan, regression_test_nvidiagpu_cuda
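
The workflows now export IREE_TEST_PATH_EXTENSION=<workspace>/build_tools/pkgci/external_test_suite, and the external_test_suite JSON configs reference the spec as ${IREE_TEST_PATH_EXTENSION}/attention_and_matmul_spec.mlir instead of a bare filename. Below is a minimal, hypothetical sketch of the expansion step a test harness would perform when reading those configs; the "iree_compile_flags" key name is assumed by analogy with the "iree_run_module_flags" key visible in the JSON hunks further down, and this is not the SHARK-TestSuite implementation.

    import json
    import os

    def load_compile_flags(config_path: str) -> list[str]:
        """Read a test-config JSON file and expand env vars in its compile flags."""
        with open(config_path) as f:
            config = json.load(f)
        # Assumed key name, mirroring the "iree_run_module_flags" key in the hunks below.
        flags = config["iree_compile_flags"]
        # os.path.expandvars substitutes ${IREE_TEST_PATH_EXTENSION} with the
        # directory exported by the workflow env, i.e.
        # <workspace>/build_tools/pkgci/external_test_suite.
        return [os.path.expandvars(flag) for flag in flags]

    if __name__ == "__main__":
        print(load_compile_flags("gpu_rocm_models_additional_flags_gfx90a.json"))
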
diff --git a/.github/workflows/pkgci_regression_test_amdgpu_rocm.yml b/.github/workflows/pkgci_regression_test_amdgpu_rocm.yml
index 79aae61..f7dc0db 100644
--- a/.github/workflows/pkgci_regression_test_amdgpu_rocm.yml
+++ b/.github/workflows/pkgci_regression_test_amdgpu_rocm.yml
@@ -60,7 +60,7 @@
uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0
with:
repository: nod-ai/SHARK-TestSuite
- ref: 9e9774632d32674e9ffa44b0a2ffc054f6c590ad
+ ref: 337083616ae6f596c0206a9edd1c47e8afc0e400
path: SHARK-TestSuite
submodules: false
- name: Installing external TestSuite Python requirements
@@ -99,6 +99,7 @@
IREERS_ARTIFACT_DIR: ${{ github.workspace }}/artifacts
VENV_DIR: ${{ github.workspace }}/venv
IREE_TEST_FILES: ~/iree_tests_cache
+ IREE_TEST_PATH_EXTENSION: ${{ github.workspace }}/build_tools/pkgci/external_test_suite
steps:
- name: Checking out IREE repository
uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0
@@ -123,7 +124,7 @@
uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0
with:
repository: nod-ai/SHARK-TestSuite
- ref: 9e9774632d32674e9ffa44b0a2ffc054f6c590ad
+ ref: 337083616ae6f596c0206a9edd1c47e8afc0e400
path: SHARK-TestSuite
submodules: false
lfs: true
diff --git a/.github/workflows/pkgci_regression_test_amdgpu_vulkan.yml b/.github/workflows/pkgci_regression_test_amdgpu_vulkan.yml
index 9d5ba18..e6e669e 100644
--- a/.github/workflows/pkgci_regression_test_amdgpu_vulkan.yml
+++ b/.github/workflows/pkgci_regression_test_amdgpu_vulkan.yml
@@ -57,7 +57,7 @@
uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0
with:
repository: nod-ai/SHARK-TestSuite
- ref: 9e9774632d32674e9ffa44b0a2ffc054f6c590ad
+ ref: 337083616ae6f596c0206a9edd1c47e8afc0e400
path: SHARK-TestSuite
submodules: false
- name: Installing external TestSuite Python requirements
@@ -106,7 +106,7 @@
uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0
with:
repository: nod-ai/SHARK-TestSuite
- ref: 9e9774632d32674e9ffa44b0a2ffc054f6c590ad
+ ref: 337083616ae6f596c0206a9edd1c47e8afc0e400
path: SHARK-TestSuite
submodules: false
lfs: true
diff --git a/.github/workflows/pkgci_regression_test_cpu.yml b/.github/workflows/pkgci_regression_test_cpu.yml
index 19e2bb0..b2c2675 100644
--- a/.github/workflows/pkgci_regression_test_cpu.yml
+++ b/.github/workflows/pkgci_regression_test_cpu.yml
@@ -57,7 +57,7 @@
uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0
with:
repository: nod-ai/SHARK-TestSuite
- ref: 9e9774632d32674e9ffa44b0a2ffc054f6c590ad
+ ref: 337083616ae6f596c0206a9edd1c47e8afc0e400
path: SHARK-TestSuite
submodules: false
lfs: true
@@ -122,7 +122,7 @@
uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0
with:
repository: nod-ai/SHARK-TestSuite
- ref: 9e9774632d32674e9ffa44b0a2ffc054f6c590ad
+ ref: 337083616ae6f596c0206a9edd1c47e8afc0e400
path: SHARK-TestSuite
submodules: false
lfs: true
diff --git a/.github/workflows/pkgci_regression_test_nvidiagpu_cuda.yml b/.github/workflows/pkgci_regression_test_nvidiagpu_cuda.yml
index 0d28a55..79a935a 100644
--- a/.github/workflows/pkgci_regression_test_nvidiagpu_cuda.yml
+++ b/.github/workflows/pkgci_regression_test_nvidiagpu_cuda.yml
@@ -57,7 +57,7 @@
uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0
with:
repository: nod-ai/SHARK-TestSuite
- ref: 9e9774632d32674e9ffa44b0a2ffc054f6c590ad
+ ref: 337083616ae6f596c0206a9edd1c47e8afc0e400
path: SHARK-TestSuite
submodules: false
- name: Installing external TestSuite Python requirements
diff --git a/.github/workflows/pkgci_regression_test_nvidiagpu_vulkan.yml b/.github/workflows/pkgci_regression_test_nvidiagpu_vulkan.yml
index 60dd4d5..7c3bdf2 100644
--- a/.github/workflows/pkgci_regression_test_nvidiagpu_vulkan.yml
+++ b/.github/workflows/pkgci_regression_test_nvidiagpu_vulkan.yml
@@ -57,7 +57,7 @@
uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0
with:
repository: nod-ai/SHARK-TestSuite
- ref: 9e9774632d32674e9ffa44b0a2ffc054f6c590ad
+ ref: 337083616ae6f596c0206a9edd1c47e8afc0e400
path: SHARK-TestSuite
submodules: false
- name: Installing external TestSuite Python requirements
diff --git a/build_tools/pkgci/external_test_suite/attention_and_matmul_spec.mlir b/build_tools/pkgci/external_test_suite/attention_and_matmul_spec.mlir
new file mode 100644
index 0000000..9e799d4
--- /dev/null
+++ b/build_tools/pkgci/external_test_suite/attention_and_matmul_spec.mlir
@@ -0,0 +1,475 @@
+// Transform dialect specification for attention on MI300 with MFMA.
+// This script only supports variants of attention with a sequence
+// length that is a multiple of 64. There are two near-duplicate scripts
+// because we need different tile sizes when the head dimension is 512.
+// TODO: Figure out how to parameterize the tile sizes without duplicating
+// the attention function.
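+//
+// Usage: the regression-test JSON configs in this directory pass this file to
+// the compiler via
+//   --iree-codegen-transform-dialect-library=${IREE_TEST_PATH_EXTENSION}/attention_and_matmul_spec.mlir
+// where IREE_TEST_PATH_EXTENSION is exported by the pkgci regression-test
+// workflows and points at build_tools/pkgci/external_test_suite.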
+
+#layout_16 = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>
+#layout = #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>
+
+module attributes { transform.with_named_sequence } {
+//===----------------------------------------------------------------------===//
+// Attention
+//===----------------------------------------------------------------------===//
+
+ // Utility matcher for finding all undistributed fills.
+ transform.named_sequence @matcher(%arg0: !transform.any_op {transform.readonly}) -> !transform.any_op {
+ transform.match.operation_name %arg0 ["linalg.fill"] : !transform.any_op
+ %0 = transform.get_parent_op %arg0 {allow_empty_results, nth_parent = 2 : i64, op_name = "scf.forall"} : (!transform.any_op) -> !transform.any_op
+ transform.match.operation_empty %0 : !transform.any_op
+ transform.yield %arg0 : !transform.any_op
+ }
+
+ transform.named_sequence @get_undistributed_fills(%arg0: !transform.any_op {transform.readonly}) -> !transform.any_op {
+ %0 = transform.collect_matching @matcher in %arg0 : (!transform.any_op) -> !transform.any_op
+ transform.yield %0 : !transform.any_op
+ }
+
+ // Script for FA2 transform pipeline when head_dim % 64 = 0.
+ transform.named_sequence @__attention_main(%variant_op: !transform.any_op {transform.readonly}) {
+ // Get attention op
+ // ==========================================
+ %attention = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+
+ // Tile and distribute to workgroups
+ // ==========================================
+ %tiled_attention, %forall_grid =
+ transform.structured.tile_using_forall %attention tile_sizes [1, 128]
+ ( mapping = [#gpu.block<x>, #gpu.block<y>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
+
+ // Tile batch dimensions of attention
+ // ==========================================
+ %attention2 = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %batch_tiled_attn, %loop = transform.structured.tile_using_for %attention2 [1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %top_level_func {
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.apply_cse to %top_level_func : !transform.any_op
+
+ // Promote query and output operands
+ // ==========================================
+ //%attention3 = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ //%promoted_attention, %alloc_a0, %alloc_a1 = transform.iree.promote_operands %attention3 [0, 3]
+ // : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+
+ // Tile and decompose attention
+ // ==========================================
+ %attention4 = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %acc_fill, %max_fill, %sum_fill, %inner_loop, %final_scaling, %last_truncate, %blocked_attention = transform.iree.tile_attention %attention4 {tile_size = 32} :
+ (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+ %scale_q, %fill_op, %first_matmul, %reduce_max, %partial_softmax, %scale_factor, %update, %reduce_sum, %truncate, %scale_acc, %second_matmul
+ = transform.iree.decompose_tiled_attention %blocked_attention {tile_size = 32} :
+ (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+
+ // Promote key and value operands
+ // ==========================================
+ %promoted_first_matmul, %alloc0 = transform.iree.promote_operands %first_matmul [1]
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %promoted_second_matmul, %alloc1 = transform.iree.promote_operands %second_matmul [1]
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ // Tile and fuse attention ops
+ // ==========================================
+ %tiled_matmul, %forall = transform.structured.tile_using_forall %promoted_second_matmul tile_sizes [32] (mapping = [#gpu.warp<linear_dim_0>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %tiled_reduce_sum, %forall_reduce = transform.structured.tile_using_forall %reduce_sum tile_sizes [32] (mapping = [#gpu.warp<linear_dim_0>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+
+ %f0, %loop0 = transform.structured.fuse_into_containing_op %scale_acc into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %f1, %loop1 = transform.structured.fuse_into_containing_op %truncate into %loop0 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ transform.apply_cse to %func : !transform.any_op
+
+ %loop4 = transform.loop.fuse_sibling %forall_reduce into %loop1 : (!transform.any_op, !transform.any_op) -> !transform.any_op
+ transform.apply_cse to %func : !transform.any_op
+
+ %f5_1, %loop5_1 = transform.structured.fuse_into_containing_op %update into %loop4 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.apply_cse to %func : !transform.any_op
+
+ %f5, %loop5 = transform.structured.fuse_into_containing_op %scale_factor into %loop5_1 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %f6, %loop6 = transform.structured.fuse_into_containing_op %partial_softmax into %loop5 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.apply_cse to %func : !transform.any_op
+
+ %f7, %loop7 = transform.structured.fuse_into_containing_op %reduce_max into %loop6 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %f8, %loop8 = transform.structured.fuse_into_containing_op %promoted_first_matmul into %loop7 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.apply_patterns to %func {
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.apply_cse to %func : !transform.any_op
+
+ %f9, %loop9 = transform.structured.fuse_into_containing_op %fill_op into %loop8 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ transform.apply_patterns to %func {
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.apply_cse to %func : !transform.any_op
+
+ %f10, %loop10 = transform.structured.fuse_into_containing_op %scale_q into %loop9 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ transform.apply_patterns to %func {
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.apply_cse to %func : !transform.any_op
+
+ // Distribute fills
+ // ==========================================
+
+ // Get all fills that haven't been distributed to warps.
+ %fills = transform.include @get_undistributed_fills failures(propagate) (%variant_op) : (!transform.any_op) -> !transform.any_op
+ %tiled_fill, %fill_grid = transform.structured.tile_using_forall %fills tile_sizes[32] (mapping = [#gpu.warp<linear_dim_0>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ // Distribute last_truncate and fuse final_scaling into it
+ // ==========================================
+ %tiled_truncate, %loop_truncate = transform.structured.tile_using_forall %last_truncate tile_sizes[32] (mapping = [#gpu.warp<linear_dim_0>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.structured.fuse_into_containing_op %final_scaling into %loop_truncate : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ transform.apply_patterns to %func {
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.apply_cse to %func : !transform.any_op
+
+ // Vectorize function
+ // ==========================================
+ transform.apply_patterns to %func {
+ transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+ transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+ transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+ } : !transform.any_op
+ %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> (!transform.any_op)
+
+ // Bufferization
+ // ==========================================
+ transform.apply_patterns to %func_3 {
+ transform.apply_patterns.tensor.reassociative_reshape_folding
+ transform.apply_patterns.canonicalization
+ transform.apply_patterns.iree.fold_fill_into_pad
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ } : !transform.any_op
+ transform.apply_cse to %func_3 : !transform.any_op
+ transform.iree.eliminate_empty_tensors %func_3 : (!transform.any_op) -> ()
+ transform.apply_patterns to %func_3 { transform.apply_patterns.linalg.erase_unnecessary_inputs } : !transform.any_op
+ %memref_func = transform.iree.bufferize { target_gpu } %func_3 : (!transform.any_op) -> (!transform.any_op)
+
+ // Step 5. Pre-process the contract and transfer ops to put them in the right form.
+ // ===========================================================================
+ transform.apply_patterns to %memref_func {
+ transform.apply_patterns.vector.fold_arith_extension
+ } : !transform.any_op
+
+ // Step 6. Post-bufferization vector distribution
+ // ===========================================================================
+ transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> ()
+ transform.iree.map_nested_forall_to_gpu_threads %memref_func workgroup_dims = [64, 4, 1] subgroup_size = 64 : (!transform.any_op) -> ()
+
+ transform.apply_patterns to %memref_func {
+ transform.apply_patterns.memref.fold_memref_alias_ops
+ } : !transform.any_op
+ transform.iree.apply_licm %memref_func : !transform.any_op
+ transform.apply_patterns to %memref_func {
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.apply_cse to %memref_func : !transform.any_op
+ %func_8 = transform.structured.hoist_redundant_vector_transfers %memref_func
+ : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func_8 {
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.apply_cse to %func_8 : !transform.any_op
+ transform.memref.erase_dead_alloc_and_stores %func_8 : (!transform.any_op) -> ()
+
+ // Apply chained matmul optimization.
+ transform.apply_registered_pass "iree-amdgpu-prepare-chained-matmul" to %func_8 : (!transform.any_op) -> (!transform.any_op)
+
+ // Get the vector.contract ops.
+ %contracts = transform.structured.match ops{["vector.contract"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %contract1, %contract2 = transform.split_handle %contracts : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ %layout16x16x16 = transform.param.constant #layout -> !transform.any_param
+ transform.iree.set_contraction_layout_attributes %contract1, %layout16x16x16 { read_layout_indices = array<i64: 0, 1> } : !transform.any_op, !transform.any_param
+ transform.iree.set_contraction_layout_attributes %contract2, %layout16x16x16 : !transform.any_op, !transform.any_param
+
+ %distribute_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %distribute_func_2 = transform.iree.amdgpu_distribute_vectors %distribute_func : (!transform.any_op) -> !transform.any_op
+
+ transform.apply_patterns to %distribute_func_2 {
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.apply_cse to %distribute_func_2 : !transform.any_op
+
+ // Distribute shared memory copies
+ // ==========================================
+ transform.iree.gpu_distribute_shared_memory_copy %distribute_func_2 : (!transform.any_op) -> ()
+ transform.apply_patterns to %distribute_func_2 {
+ transform.apply_patterns.memref.fold_memref_alias_ops
+ transform.apply_patterns.canonicalization
+ transform.apply_patterns.linalg.tiling_canonicalization
+ } : !transform.any_op
+ transform.apply_cse to %distribute_func_2 : !transform.any_op
+
+ %forop = transform.structured.match ops{["scf.for"]} in %distribute_func_2 : (!transform.any_op) -> !transform.any_op
+ %prefetched_forop = transform.iree.prefetch_shared_memory_copies %forop : (!transform.any_op) -> (!transform.any_op)
+
+ transform.apply_patterns to %distribute_func_2 {
+ transform.apply_patterns.memref.fold_memref_alias_ops
+ transform.apply_patterns.canonicalization
+ transform.apply_patterns.linalg.tiling_canonicalization
+ } : !transform.any_op
+ transform.apply_cse to %distribute_func_2 : !transform.any_op
+
+ transform.iree.reduce_shared_memory_bank_conflicts %distribute_func_2 : (!transform.any_op) -> ()
+
+ transform.yield
+ }
+
+ // Script for FA2 transform pipeline for head_dim = 512.
+ // For head_dim = 512, since the matmul is so large, we just try to do a
+ // single-wave big load + big MFMA.
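+ // (Attention ops are routed here by @match_attention_len_512 below, which
+ // matches operand 0 against tensor<?x?x512xf16>.)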
+ transform.named_sequence @__attention_main_len_512(%variant_op: !transform.any_op {transform.readonly}) {
+ // Get attention op
+ // ==========================================
+ %attention = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+
+ // Tile and distribute to workgroups
+ // ==========================================
+ %tiled_attention, %forall_grid =
+ transform.structured.tile_using_forall %attention tile_sizes [1, 64]
+ ( mapping = [#gpu.block<x>, #gpu.block<y>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
+
+ // Tile batch dimensions of attention
+ // ==========================================
+ %attention2 = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %batch_tiled_attn, %loop = transform.structured.tile_using_for %attention2 [1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %top_level_func {
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.apply_cse to %top_level_func : !transform.any_op
+
+ // Promote query and output operands
+ // ==========================================
+ //%attention3 = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ //%promoted_attention, %alloc_a0, %alloc_a1 = transform.iree.promote_operands %attention3 [0, 3]
+ // : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+
+ // Tile and decompose attention
+ // ==========================================
+ %attention4 = transform.structured.match ops{["iree_linalg_ext.attention"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %acc_fill, %max_fill, %sum_fill, %inner_loop, %final_scaling, %last_truncate, %blocked_attention = transform.iree.tile_attention %attention4 {tile_size = 64} :
+ (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+ %scale_q, %fill_op, %first_matmul, %reduce_max, %partial_softmax, %scale_factor, %update, %reduce_sum, %truncate, %scale_acc, %second_matmul
+ = transform.iree.decompose_tiled_attention %blocked_attention {tile_size = 64} :
+ (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+
+ // Promote key and value operands
+ // ==========================================
+ // %promoted_first_matmul, %alloc0 = transform.iree.promote_operands %first_matmul [1]
+ // : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ // %promoted_second_matmul, %alloc1 = transform.iree.promote_operands %second_matmul [1]
+ // : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ // Tile and fuse attention ops
+ // ==========================================
+ %tiled_matmul, %forall = transform.structured.tile_using_forall %second_matmul tile_sizes [16] (mapping = [#gpu.warp<linear_dim_0>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %tiled_reduce_sum, %forall_reduce = transform.structured.tile_using_forall %reduce_sum tile_sizes [16] (mapping = [#gpu.warp<linear_dim_0>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+
+ %f0, %loop0 = transform.structured.fuse_into_containing_op %scale_acc into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %f1, %loop1 = transform.structured.fuse_into_containing_op %truncate into %loop0 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ transform.apply_cse to %func : !transform.any_op
+
+ %loop4 = transform.loop.fuse_sibling %forall_reduce into %loop1 : (!transform.any_op, !transform.any_op) -> !transform.any_op
+ transform.apply_cse to %func : !transform.any_op
+
+ %f5_1, %loop5_1 = transform.structured.fuse_into_containing_op %update into %loop4 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.apply_cse to %func : !transform.any_op
+
+ %f5, %loop5 = transform.structured.fuse_into_containing_op %scale_factor into %loop5_1 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %f6, %loop6 = transform.structured.fuse_into_containing_op %partial_softmax into %loop5 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.apply_cse to %func : !transform.any_op
+
+ %f7, %loop7 = transform.structured.fuse_into_containing_op %reduce_max into %loop6 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %f8, %loop8 = transform.structured.fuse_into_containing_op %first_matmul into %loop7 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.apply_patterns to %func {
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.apply_cse to %func : !transform.any_op
+
+ %f9, %loop9 = transform.structured.fuse_into_containing_op %fill_op into %loop8 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ transform.apply_patterns to %func {
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.apply_cse to %func : !transform.any_op
+
+ %f10, %loop10 = transform.structured.fuse_into_containing_op %scale_q into %loop9 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ transform.apply_patterns to %func {
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.apply_cse to %func : !transform.any_op
+
+ // Distribute fills
+ // ==========================================
+
+ // Get all fills that haven't been distributed to warps.
+ %fills = transform.include @get_undistributed_fills failures(propagate) (%variant_op) : (!transform.any_op) -> !transform.any_op
+ %tiled_fill, %fill_grid = transform.structured.tile_using_forall %fills tile_sizes[16] (mapping = [#gpu.warp<linear_dim_0>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ // Distribute last_truncate and fuse final_scaling into it
+ // ==========================================
+ %tiled_truncate, %loop_truncate = transform.structured.tile_using_forall %last_truncate tile_sizes[16] (mapping = [#gpu.warp<linear_dim_0>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ transform.structured.fuse_into_containing_op %final_scaling into %loop_truncate : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ transform.apply_patterns to %func {
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.apply_cse to %func : !transform.any_op
+
+ // Vectorize function
+ // ==========================================
+ transform.apply_patterns to %func {
+ transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
+ transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
+ transform.apply_patterns.vector.cast_away_vector_leading_one_dim
+ } : !transform.any_op
+ %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> (!transform.any_op)
+
+ // Bufferization
+ // ==========================================
+ transform.apply_patterns to %func_3 {
+ transform.apply_patterns.tensor.reassociative_reshape_folding
+ transform.apply_patterns.canonicalization
+ transform.apply_patterns.iree.fold_fill_into_pad
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ } : !transform.any_op
+ transform.apply_cse to %func_3 : !transform.any_op
+ transform.iree.eliminate_empty_tensors %func_3 : (!transform.any_op) -> ()
+ transform.apply_patterns to %func_3 { transform.apply_patterns.linalg.erase_unnecessary_inputs } : !transform.any_op
+ %memref_func = transform.iree.bufferize { target_gpu } %func_3 : (!transform.any_op) -> (!transform.any_op)
+
+ // Step 5. Pre-process the contract and transfer ops to put them in the right form.
+ // ===========================================================================
+ transform.apply_patterns to %memref_func {
+ transform.apply_patterns.vector.fold_arith_extension
+ } : !transform.any_op
+
+ // Step 6. Post-bufferization vector distribution
+ // ===========================================================================
+ transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> ()
+ transform.iree.map_nested_forall_to_gpu_threads %memref_func workgroup_dims = [64, 4, 1] subgroup_size = 64 : (!transform.any_op) -> ()
+
+ transform.apply_patterns to %memref_func {
+ transform.apply_patterns.memref.fold_memref_alias_ops
+ } : !transform.any_op
+ transform.iree.apply_licm %memref_func : !transform.any_op
+ transform.apply_patterns to %memref_func {
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.apply_cse to %memref_func : !transform.any_op
+ %func_8 = transform.structured.hoist_redundant_vector_transfers %memref_func
+ : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func_8 {
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.apply_cse to %func_8 : !transform.any_op
+ transform.memref.erase_dead_alloc_and_stores %func_8 : (!transform.any_op) -> ()
+
+ // Apply chained matmul optimization.
+ transform.apply_registered_pass "iree-amdgpu-prepare-chained-matmul" to %func_8 : (!transform.any_op) -> (!transform.any_op)
+
+ // transform.print %memref_func : !transform.any_op
+
+ // Get the vector.contract ops.
+ %contracts = transform.structured.match ops{["vector.contract"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %contract1, %contract2 = transform.split_handle %contracts : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ %layout16x16x16 = transform.param.constant #layout_16 -> !transform.any_param
+ transform.iree.set_contraction_layout_attributes %contract1, %layout16x16x16 { read_layout_indices = array<i64: 0, 1> } : !transform.any_op, !transform.any_param
+ transform.iree.set_contraction_layout_attributes %contract2, %layout16x16x16 : !transform.any_op, !transform.any_param
+
+ %distribute_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+ %distribute_func_2 = transform.iree.amdgpu_distribute_vectors %distribute_func : (!transform.any_op) -> !transform.any_op
+
+ transform.apply_patterns to %distribute_func_2 {
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.apply_cse to %distribute_func_2 : !transform.any_op
+
+ // Distribute shared memory copies
+ // ==========================================
+ %func_10 = transform.structured.match ops{["func.func"]} in %distribute_func_2 : (!transform.any_op) -> !transform.any_op
+ transform.iree.gpu_distribute_shared_memory_copy %func_10 : (!transform.any_op) -> ()
+ transform.apply_patterns to %func_10 {
+ transform.apply_patterns.memref.fold_memref_alias_ops
+ transform.apply_patterns.canonicalization
+ transform.apply_patterns.linalg.tiling_canonicalization
+ } : !transform.any_op
+ transform.apply_cse to %func_10 : !transform.any_op
+
+ %forop = transform.structured.match ops{["scf.for"]} in %distribute_func_2 : (!transform.any_op) -> !transform.any_op
+ %prefetched_forop = transform.iree.prefetch_shared_memory_copies %forop : (!transform.any_op) -> (!transform.any_op)
+
+ transform.apply_patterns to %func_10 {
+ transform.apply_patterns.memref.fold_memref_alias_ops
+ transform.apply_patterns.canonicalization
+ transform.apply_patterns.linalg.tiling_canonicalization
+ } : !transform.any_op
+ transform.apply_cse to %func_10 : !transform.any_op
+
+ %func_11 = transform.structured.match ops{["func.func"]} in %distribute_func_2 : (!transform.any_op) -> !transform.any_op
+ transform.iree.reduce_shared_memory_bank_conflicts %func_11 : (!transform.any_op) -> ()
+
+ transform.yield
+ }
+
+ // Send it down a custom transform dialect pipeline.
+ transform.named_sequence @custom_attention_len_512(%attention: !transform.any_op {transform.readonly}) {
+ %func = transform.get_parent_op %attention {op_name = "func.func"} : (!transform.any_op) -> !transform.any_op
+ %attn = transform.param.constant #iree_codegen.translation_info<TransformDialectCodegen codegen_spec = @__attention_main_len_512, {"amdgpu-waves-per-eu" = 1}> -> !transform.any_param
+ transform.annotate %func "translation_info" = %attn : !transform.any_op, !transform.any_param
+ transform.yield
+ }
+
+ transform.named_sequence @match_attention_len_512(%attention: !transform.any_op {transform.readonly}) -> (!transform.any_op) {
+ transform.match.operation_name %attention ["iree_linalg_ext.attention"] : !transform.any_op
+ %in0 = transform.get_operand %attention[0] : (!transform.any_op) -> !transform.any_value
+ transform.iree.match.cast_compatible_type %in0 = tensor<?x?x512xf16> : !transform.any_value
+ transform.yield %attention : !transform.any_op
+ }
+
+ // Send it down a custom transform dialect pipeline.
+ transform.named_sequence @custom_attention(%attention: !transform.any_op {transform.readonly}) {
+ %func = transform.get_parent_op %attention {op_name = "func.func"} : (!transform.any_op) -> !transform.any_op
+ %attn = transform.param.constant #iree_codegen.translation_info<TransformDialectCodegen codegen_spec = @__attention_main, {"amdgpu-waves-per-eu" = 2}> -> !transform.any_param
+ transform.annotate %func "translation_info" = %attn : !transform.any_op, !transform.any_param
+ transform.yield
+ }
+
+ transform.named_sequence @match_attention(%attention: !transform.any_op {transform.readonly}) -> (!transform.any_op) {
+ transform.match.operation_name %attention ["iree_linalg_ext.attention"] : !transform.any_op
+ %in0 = transform.get_operand %attention[0] : (!transform.any_op) -> !transform.any_value
+ transform.iree.match.cast_compatible_type %in0 = tensor<?x?x?xf16> : !transform.any_value
+ transform.iree.match.dim_is_multiple_of %in0[2], 64 : !transform.any_value
+ transform.yield %attention : !transform.any_op
+ }
+
+//===----------------------------------------------------------------------===//
+// Entry point
+//===----------------------------------------------------------------------===//
+
+ transform.named_sequence @__kernel_config(%variant_op: !transform.any_op {transform.consumed}) {
+ transform.foreach_match in %variant_op
+ // Attention.
+ @match_attention_len_512 -> @custom_attention_len_512,
+ @match_attention -> @custom_attention
+ : (!transform.any_op) -> (!transform.any_op)
+ transform.yield
+ }
+} //// module
diff --git a/build_tools/pkgci/external_test_suite/gpu_rocm_models_additional_flags_gfx90a.json b/build_tools/pkgci/external_test_suite/gpu_rocm_models_additional_flags_gfx90a.json
index aa68967..7a87b65 100644
--- a/build_tools/pkgci/external_test_suite/gpu_rocm_models_additional_flags_gfx90a.json
+++ b/build_tools/pkgci/external_test_suite/gpu_rocm_models_additional_flags_gfx90a.json
@@ -4,7 +4,7 @@
"--iree-hal-target-backends=rocm",
"--iree-rocm-target-chip=gfx90a",
"--iree-opt-const-eval=false",
- "--iree-codegen-transform-dialect-library=attention_and_matmul_spec.mlir"
+ "--iree-codegen-transform-dialect-library=${IREE_TEST_PATH_EXTENSION}/attention_and_matmul_spec.mlir"
],
"iree_run_module_flags": [
"--device=hip"
diff --git a/build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_gpu_rocm_gfx90a.json b/build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_gpu_rocm_gfx90a.json
index 5dd6e05..54d4e1d 100644
--- a/build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_gpu_rocm_gfx90a.json
+++ b/build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_gpu_rocm_gfx90a.json
@@ -4,7 +4,7 @@
"--iree-hal-target-backends=rocm",
"--iree-rocm-target-chip=gfx90a",
"--iree-opt-const-eval=false",
- "--iree-codegen-transform-dialect-library=attention_and_matmul_spec.mlir",
+ "--iree-codegen-transform-dialect-library=${IREE_TEST_PATH_EXTENSION}/attention_and_matmul_spec.mlir",
"--iree-global-opt-propagate-transposes=true",
"--iree-global-opt-enable-fuse-horizontal-contractions=true",
"--iree-flow-enable-aggressive-fusion=true",