blob: b7be7377e3d1dbb629c94b28578ef606338adc68 [file]
// Regression test: dynamic-shaped batch_matmul with a transposed RHS,
// compiled for WMMAR3 on RDNA3 (gfx1100).
//
// The WMMAR3 accumulator layout (outer = {8,1}) forces an expand_shape on
// the output. Under dynamic shapes, tensor.dim users of the forall result
// used to block the ExpandDestinationForallOp pattern, which produced a
// separate shared-memory allocation for the output accumulator and pushed
// total shared memory past the 65536-byte limit.
func.func @dynamic_batch_matmul_transposed_rhs() {
  %zero = arith.constant 0.0 : f32
  %dim_idx = arith.constant 1 : index
  %in_a = flow.tensor.dynamic_constant dense<1.0> : tensor<2x128x128xf16> -> tensor<2x?x128xf16>
  %in_b = flow.tensor.dynamic_constant dense<1.0> : tensor<2x128x128xf16> -> tensor<2x?x128xf16>
  %size_m = tensor.dim %in_a, %dim_idx : tensor<2x?x128xf16>
  %size_n = tensor.dim %in_b, %dim_idx : tensor<2x?x128xf16>
  %empty = tensor.empty(%size_m, %size_n) : tensor<2x?x?xf32>
  %acc = linalg.fill ins(%zero : f32) outs(%empty : tensor<2x?x?xf32>) -> tensor<2x?x?xf32>
  // RHS map is (b, n, k): the second operand is consumed in transposed form.
  %result = linalg.batch_matmul
      indexing_maps = [affine_map<(b, m, n, k) -> (b, m, k)>,
                       affine_map<(b, m, n, k) -> (b, n, k)>,
                       affine_map<(b, m, n, k) -> (b, m, n)>]
      ins(%in_a, %in_b : tensor<2x?x128xf16>, tensor<2x?x128xf16>)
      outs(%acc : tensor<2x?x?xf32>) -> tensor<2x?x?xf32>
  // Every input element is 1.0 and K = 128, so each output element is 128.0.
  %ref = flow.tensor.dynamic_constant dense<128.0> : tensor<2x128x128xf32> -> tensor<2x?x?xf32>
  check.expect_almost_eq(%result, %ref, atol 1.0e-01) : tensor<2x?x?xf32>
  return
}