blob: b7be7377e3d1dbb629c94b28578ef606338adc68 [file]
// Regression test: dynamic-shaped batch_matmul with a transposed RHS,
// compiled for WMMAR3 on RDNA3 (gfx1100).
//
// The WMMAR3 accumulator layout (outer = {8,1}) forces an expand_shape on
// the output. Under dynamic shapes, tensor.dim users of the forall result
// used to block the ExpandDestinationForallOp pattern, which produced a
// separate shared-memory allocation for the output accumulator and pushed
// total shared memory past the 65536-byte limit.
func.func @dynamic_batch_matmul_transposed_rhs() {
  %zero = arith.constant 0.0 : f32
  %dim_idx = arith.constant 1 : index
  %in_a = flow.tensor.dynamic_constant dense<1.0> : tensor<2x128x128xf16> -> tensor<2x?x128xf16>
  %in_b = flow.tensor.dynamic_constant dense<1.0> : tensor<2x128x128xf16> -> tensor<2x?x128xf16>
  %size_m = tensor.dim %in_a, %dim_idx : tensor<2x?x128xf16>
  %size_n = tensor.dim %in_b, %dim_idx : tensor<2x?x128xf16>
  %empty = tensor.empty(%size_m, %size_n) : tensor<2x?x?xf32>
  %acc = linalg.fill ins(%zero : f32) outs(%empty : tensor<2x?x?xf32>) -> tensor<2x?x?xf32>
  // RHS map is (b, n, k): the second operand is consumed in transposed form.
  %result = linalg.batch_matmul
      indexing_maps = [affine_map<(b, m, n, k) -> (b, m, k)>,
                       affine_map<(b, m, n, k) -> (b, n, k)>,
                       affine_map<(b, m, n, k) -> (b, m, n)>]
      ins(%in_a, %in_b : tensor<2x?x128xf16>, tensor<2x?x128xf16>)
      outs(%acc : tensor<2x?x?xf32>) -> tensor<2x?x?xf32>
  // Every input element is 1.0 and K = 128, so each output element is 128.0.
  %ref = flow.tensor.dynamic_constant dense<128.0> : tensor<2x128x128xf32> -> tensor<2x?x?xf32>
  check.expect_almost_eq(%result, %ref, atol 1.0e-01) : tensor<2x?x?xf32>
  return
}