tests/e2e/matmul/generate_code.py - 3p/openxla/iree - Git at Google

 #!/usr/bin/env python3
 # Copyright 2021 The IREE Authors
 #
 # Licensed under the Apache License v2.0 with LLVM Exceptions.
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 """iree_generated_e2e_matmul_test generator for e2e matmul tests."""

 from typing import Optional
 import argparse
 import dataclasses
 import typing

 from tests.e2e.matmul.common import *
 from tests.e2e.matmul.compilation_info import *
 from tests.e2e.matmul.generate_code_mx import *


 # Element type used to spell `t` inside IR tensor types. `ui8` is stored as
 # signless `i8`; every other type is stored under its own `.value`. (The
 # unsigned semantics live on `arith.extui` in the linalg body and surface via
 # encoding inference.)
 def _storage_type(t: MatrixElemTypeId) -> str:
     return "i8" if t == MatrixElemTypeId.UI8 else t.value


 # Decides whether the matmul has to be written as an explicit-body
 # `linalg.generic` instead of the named `linalg.matmul`. That is the case when
 # the LHS and RHS element types differ, or when the (shared) element type is
 # the unsigned `ui8`.
 def _needs_linalg_generic(
     lhs_type: MatrixElemTypeId, rhs_type: MatrixElemTypeId
 ) -> bool:
     return lhs_type != rhs_type or lhs_type == MatrixElemTypeId.UI8


 # Helper for generate_function.
 # Builds the name of a test function in the generated MLIR code. The name
 # encodes whether the matmul accumulates, the dimensions and element type of
 # each of the three matrices and, when compilation_info is given, its lowering
 # pipeline.
 def generate_function_name(
     lhs_type: MatrixElemTypeId,
     rhs_type: MatrixElemTypeId,
     acc_type: MatrixElemTypeId,
     shapes: TestInputMatricesShapes,
     accumulate: bool,
     compilation_info: typing.Optional[CompilationInfo] = None,
 ):
     # "<rows>x<cols>x<element type>" for one matrix; int_or_DYN renders each
     # dimension.
     def describe(rows, cols, elem_type):
         return f"{int_or_DYN(rows)}x{int_or_DYN(cols)}x{elem_type.value}"

     lhs_desc = describe(shapes.lhs_rows, shapes.lhs_cols, lhs_type)
     rhs_desc = describe(shapes.rhs_rows, shapes.rhs_cols, rhs_type)
     acc_desc = describe(shapes.acc_rows, shapes.acc_cols, acc_type)

     suffix = ""
     if compilation_info:
         pipeline = compilation_info.dispatch_lowering_pass_pipeline
         wrapper = "#iree_gpu.pipeline<"
         if pipeline.startswith(wrapper):
             # Strip the #iree_gpu.pipeline<...> wrapper (prefix and final
             # character) for use in identifiers.
             pipeline = pipeline[len(wrapper) : -1]
         suffix = f"_for_{pipeline}"

     prefix = "matmul_accumulate" if accumulate else "matmul"
     return f"{prefix}_{lhs_desc}_times_{rhs_desc}_into_{acc_desc}{suffix}"


 # Returns the comma-separated `affine_map`s (LHS, RHS, accumulator) of a matmul
 # over the iteration space (d0, d1, d2) = (M, N, K). The RHS is indexed as
 # (N, K) when transpose_rhs is set and as (K, N) otherwise.
 def _matmul_indexing_maps(transpose_rhs: bool) -> str:
     rhs_results = "(d1, d2)" if transpose_rhs else "(d2, d1)"
     return (
         "affine_map<(d0, d1, d2) -> (d0, d2)>, "
         f"affine_map<(d0, d1, d2) -> {rhs_results}>, "
         "affine_map<(d0, d1, d2) -> (d0, d1)>"
     )


 # Returns the MLIR lines that define `%init_acc`, a `tensor.empty` of type
 # acc_tensor_type. For each dynamic accumulator dimension a `tensor.dim` query
 # is emitted first: rows (M) come from dimension 0 of %lhs, columns (N) from
 # dimension rhs_n_dim of %rhs.
 def _generate_init_acc(
     lhs_tensor_type: str,
     rhs_tensor_type: str,
     acc_tensor_type: str,
     dynamic_rows: bool,
     dynamic_cols: bool,
     rhs_n_dim: int,
 ) -> str:
     index_constants = set()
     dim_queries = []
     dynamic_sizes = []
     if dynamic_rows:
         index_constants.add(0)
         dim_queries.append(
             f"  %acc_dim0 = tensor.dim %lhs, %c0 : {lhs_tensor_type}\n"
         )
         dynamic_sizes.append("%acc_dim0")
     if dynamic_cols:
         index_constants.add(rhs_n_dim)
         dim_queries.append(
             f"  %acc_dim1 = tensor.dim %rhs, %c{rhs_n_dim} : {rhs_tensor_type}\n"
         )
         dynamic_sizes.append("%acc_dim1")
     # Each index constant is defined once, even if both queries use it.
     constants = [
         f"  %c{i} = arith.constant {i} : index\n" for i in sorted(index_constants)
     ]
     return (
         "".join(constants + dim_queries)
         + f"  %init_acc = tensor.empty({', '.join(dynamic_sizes)}) : {acc_tensor_type}\n"
     )


 # Generates a test function in the generated MLIR code.
 # The generated function takes the matmul operands (%lhs, %rhs, and %acc when
 # shape.accumulate is set; otherwise a zero-filled accumulator is created
 # inside the function), performs the matmul with `linalg.matmul` — or with an
 # explicit-body `linalg.generic` when _needs_linalg_generic requires it — and
 # returns the result.
 # When mx_scale_type is set, generation is delegated to generate_function_mx.
 # Returns an MLIRFunction carrying the function's name, signature, import
 # declaration and definition.
 def generate_function(
     lhs_type: MatrixElemTypeId,
     rhs_type: MatrixElemTypeId,
     acc_type: MatrixElemTypeId,
     mx_scale_type: MatrixElemTypeId,
     mx_block_size: int,
     shape: TestShape,
     transpose_rhs: bool,
     dynamicities: tuple[Dynamicity, Dynamicity, Dynamicity],
     compilation_info: Optional[CompilationInfo] = None,
 ):
     if mx_scale_type:
         # MX matmul currently only supports same-type inputs.
         assert lhs_type == rhs_type, "MX matmul requires lhs_type == rhs_type"
         return generate_function_mx(
             lhs_rhs_type=lhs_type,
             acc_type=acc_type,
             mx_scale_type=mx_scale_type,
             mx_block_size=mx_block_size,
             shape=shape,
             transpose_rhs=transpose_rhs,
             dynamicities=dynamicities,
             compilation_info=compilation_info,
         )

     shapes = generate_shapes(shape, transpose_rhs, dynamicities)
     func_name = generate_function_name(
         lhs_type=lhs_type,
         rhs_type=rhs_type,
         acc_type=acc_type,
         shapes=shapes,
         accumulate=shape.accumulate,
         compilation_info=compilation_info,
     )
     lhs_r = int_or_question_mark(shapes.lhs_rows)
     lhs_c = int_or_question_mark(shapes.lhs_cols)
     rhs_r = int_or_question_mark(shapes.rhs_rows)
     rhs_c = int_or_question_mark(shapes.rhs_cols)
     acc_r = int_or_question_mark(shapes.acc_rows)
     acc_c = int_or_question_mark(shapes.acc_cols)

     # Use signless storage in IR (`ui8` becomes `i8` here) — the unsigned
     # semantics are expressed by `arith.extui` in the linalg body, which the
     # encoding system's signedness inference picks up.
     lhs_tensor_type = f"tensor<{lhs_r}x{lhs_c}x{_storage_type(lhs_type)}>"
     rhs_tensor_type = f"tensor<{rhs_r}x{rhs_c}x{_storage_type(rhs_type)}>"
     acc_tensor_type = f"tensor<{acc_r}x{acc_c}x{acc_type.value}>"

     (
         compilation_info_string,
         compilation_info_attr,
     ) = generate_compilation_info_string_and_attr(compilation_info)

     args = [("%lhs", lhs_tensor_type), ("%rhs", rhs_tensor_type)]
     if shape.accumulate:
         args += [("%acc", acc_tensor_type)]

     func_definition = compilation_info_string + (
         f"util.func @{func_name}("
         + ", ".join([name + ": " + ty for name, ty in args])
         + f") -> {acc_tensor_type} {{\n"
     )

     if not shape.accumulate:
         # No accumulator argument: create one inside the function and fill it
         # with zeros of the accumulator element type.
         literal_zero_for_acc_type = "0.0" if "f" in acc_type.value else "0"
         func_definition += _generate_init_acc(
             lhs_tensor_type=lhs_tensor_type,
             rhs_tensor_type=rhs_tensor_type,
             acc_tensor_type=acc_tensor_type,
             dynamic_rows=(acc_r == "?"),
             dynamic_cols=(acc_c == "?"),
             # N is dimension 0 of a transposed (NxK) RHS and dimension 1 of a
             # non-transposed (KxN) RHS.
             rhs_n_dim=0 if transpose_rhs else 1,
         )
         func_definition += (
             f"  %c0_acc_type = arith.constant {literal_zero_for_acc_type}: {acc_type.value}\n"
             f"  %acc = linalg.fill ins(%c0_acc_type : {acc_type.value}) outs(%init_acc : {acc_tensor_type}) -> {acc_tensor_type}\n"
         )

     if _needs_linalg_generic(lhs_type, rhs_type):
         # `linalg.generic` body widens each operand to the accumulator
         # type. `arith.extui` for unsigned, `arith.extsi` for signed —
         # the encoding system's signedness inference reads this to route
         # `ui8` operands to the right MMA intrinsic (e.g. vpdpbusd).
         # NOTE: the body is built from integer ops only (`arith.extsi` /
         # `arith.extui`, `arith.muli`, `arith.addi`), so this path is only
         # meaningful for integer operand and accumulator types.
         lhs_storage = _storage_type(lhs_type)
         rhs_storage = _storage_type(rhs_type)
         acc_t = acc_type.value
         lhs_ext = "arith.extui" if lhs_type == MatrixElemTypeId.UI8 else "arith.extsi"
         rhs_ext = "arith.extui" if rhs_type == MatrixElemTypeId.UI8 else "arith.extsi"
         # Iterators: M, N parallel; K reduction. Maps: (m,k), (k,n)/(n,k),
         # (m,n) — RHS uses (n,k) when transpose_rhs is set.
         maps = _matmul_indexing_maps(transpose_rhs)
         func_definition += (
             f"  %result = linalg.generic {{indexing_maps = [{maps}], "
             f'iterator_types = ["parallel", "parallel", "reduction"]}} '
             f"{compilation_info_attr} ins(%lhs, %rhs: {lhs_tensor_type}, "
             f"{rhs_tensor_type}) outs(%acc: {acc_tensor_type}) {{\n"
             f"  ^bb0(%a: {lhs_storage}, %b: {rhs_storage}, %c: {acc_t}):\n"
             f"    %a_ext = {lhs_ext} %a : {lhs_storage} to {acc_t}\n"
             f"    %b_ext = {rhs_ext} %b : {rhs_storage} to {acc_t}\n"
             f"    %prod = arith.muli %a_ext, %b_ext : {acc_t}\n"
             f"    %sum = arith.addi %c, %prod : {acc_t}\n"
             f"    linalg.yield %sum : {acc_t}\n"
             f"  }} -> {acc_tensor_type}\n"
             f"  util.return %result: {acc_tensor_type}\n"
             f"}}\n"
         )
     else:
         # The named op only gets explicit indexing maps when the RHS is
         # transposed; otherwise no `indexing_maps` attribute is emitted.
         indexing_maps_attr = ""
         if transpose_rhs:
             indexing_maps_attr = (
                 f"indexing_maps = [{_matmul_indexing_maps(transpose_rhs)}]"
             )
         func_definition += (
             f"  %result = linalg.matmul {indexing_maps_attr} {compilation_info_attr} ins(%lhs, %rhs: {lhs_tensor_type}, {rhs_tensor_type}) outs(%acc: {acc_tensor_type}) -> {acc_tensor_type}\n"
             f"  util.return %result: {acc_tensor_type}\n"
             f"}}\n"
         )

     # The result type is interpolated here (f-string), so the signature ends
     # with the actual accumulator tensor type.
     signature = ", ".join(ty for _, ty in args) + f" -> {acc_tensor_type}"
     import_declaration = (
         f"util.func private @module.{func_name}("
         + ", ".join([name + ": !hal.buffer_view" for name, ty in args])
         + ") -> !hal.buffer_view"
     )
     return MLIRFunction(
         name=func_name,
         signature=signature,
         import_declaration=import_declaration,
         definition=func_definition,
     )


 # Counter that generate_call (on its non-MX path) appends to the name of each
 # generated call function and then increments.
 call_id = 0


 # Generates a testcase, i.e. a single test function call: the MLIR text of a
 # `util.func` that emits the input matrices via generate_random_matrix, calls
 # the given test function and passes its result to
 # `matmul_test.check_matmul_results`.
 # When mx_scale_type is set, generation is delegated to generate_call_mx.
 # Returns a TestCall holding the function and the generated op text.
 def generate_call(
     function: MLIRFunction,
     lhs_type: MatrixElemTypeId,
     rhs_type: MatrixElemTypeId,
     acc_type: MatrixElemTypeId,
     mx_scale_type: MatrixElemTypeId,
     mx_block_size: int,
     shape: TestShape,
     transpose_rhs: bool,
 ):
     if mx_scale_type:
         return generate_call_mx(
             function=function,
             lhs_rhs_type=lhs_type,
             acc_type=acc_type,
             mx_scale_type=mx_scale_type,
             mx_block_size=mx_block_size,
             shape=shape,
             transpose_rhs=transpose_rhs,
         )

     global call_id
     # The trailing call_id keeps every generated call function name unique.
     accumulate_suffix = "_acc" if shape.accumulate else ""
     func_name = (
         f"{function.name}_{shape.m}_{shape.k}_{shape.n}"
         f"{accumulate_suffix}_{call_id}"
     )
     call_id += 1

     description = f"Matmul shape (MxKxN): {shape.m}x{shape.k}x{shape.n}"
     pieces = [
         f"util.func @{func_name}() attributes {{\n",
         f'  iree.reflection = {{description = "{description}"}}\n',
         "} {\n",
         "  %device_index = arith.constant 0 : index\n",
         "  %device = hal.devices.get %device_index : !hal.device\n",
     ]

     rhs_shape = [shape.n, shape.k] if transpose_rhs else [shape.k, shape.n]
     transpose_rhs_flag = 1 if transpose_rhs else 0

     # The checker always receives lhs and rhs; the call additionally receives
     # the accumulator when accumulating.
     check_args = [
         ("lhs", [shape.m, shape.k], lhs_type),
         ("rhs", rhs_shape, rhs_type),
     ]
     matmul_args = list(check_args)
     if shape.accumulate:
         matmul_args.append(("acc", [shape.m, shape.n], acc_type))
     else:
         pieces.append("  %acc = util.null : !hal.buffer_view\n")

     for arg_name, arg_shape, arg_elemtype in matmul_args:
         pieces.append(generate_random_matrix(arg_name, arg_shape, arg_elemtype))
         if arg_name == "acc":
             # TODO(#16168): there's a bug with in-place input->output aliasing
             # and we work around it here by passing in a unique copy.
             pieces.append(
                 generate_random_matrix(
                     "acc_copy", arg_shape, arg_elemtype, increment_seed=False
                 )
             )

     # Comma-separated SSA names and (all `!hal.buffer_view`) types of the args.
     def names_and_types(args_list):
         names = ", ".join("%" + name for name, _, _ in args_list)
         types = ", ".join("!hal.buffer_view" for _ in args_list)
         return names, types

     matmul_argnames, matmul_argtypes = names_and_types(matmul_args)
     check_argnames, check_argtypes = names_and_types(check_args)
     checked_acc = "%acc_copy" if shape.accumulate else "%acc"
     pieces += [
         f"  %result = util.call @module.{function.name}({matmul_argnames}) : ({matmul_argtypes}) -> !hal.buffer_view\n",
         f"  %m = arith.constant {shape.m} : i64\n",
         f"  %k = arith.constant {shape.k} : i64\n",
         f"  %n = arith.constant {shape.n} : i64\n",
         f"  %transpose_rhs = arith.constant {transpose_rhs_flag} : i32\n",
         f"  util.call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, {check_argnames}, {checked_acc}, %result) : (!hal.device, i64, i64, i64, i32,  {check_argtypes}, !hal.buffer_view, !hal.buffer_view) -> ()\n",
         "  util.return\n",
         "}\n",
     ]

     return TestCall(function=function, op="".join(pieces))
	#!/usr/bin/env python3
	# Copyright 2021 The IREE Authors
	#
	# Licensed under the Apache License v2.0 with LLVM Exceptions.
	# See https://llvm.org/LICENSE.txt for license information.
	# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	"""iree_generated_e2e_matmul_test generator for e2e matmul tests."""

	from typing import Optional
	import argparse
	import dataclasses
	import typing

	from tests.e2e.matmul.common import *
	from tests.e2e.matmul.compilation_info import *
	from tests.e2e.matmul.generate_code_mx import *


	# Element type used to spell `t` inside IR tensor types. `ui8` is stored as
	# signless `i8`; every other type is stored under its own `.value`. (The
	# unsigned semantics live on `arith.extui` in the linalg body and surface via
	# encoding inference.)
	def _storage_type(t: MatrixElemTypeId) -> str:
	    return "i8" if t == MatrixElemTypeId.UI8 else t.value


	# Decides whether the matmul has to be written as an explicit-body
	# `linalg.generic` instead of the named `linalg.matmul`. That is the case when
	# the LHS and RHS element types differ, or when the (shared) element type is
	# the unsigned `ui8`.
	def _needs_linalg_generic(
	    lhs_type: MatrixElemTypeId, rhs_type: MatrixElemTypeId
	) -> bool:
	    return lhs_type != rhs_type or lhs_type == MatrixElemTypeId.UI8


	# Helper for generate_function.
	# Builds the name of a test function in the generated MLIR code. The name
	# encodes whether the matmul accumulates, the dimensions and element type of
	# each of the three matrices and, when compilation_info is given, its lowering
	# pipeline.
	def generate_function_name(
	    lhs_type: MatrixElemTypeId,
	    rhs_type: MatrixElemTypeId,
	    acc_type: MatrixElemTypeId,
	    shapes: TestInputMatricesShapes,
	    accumulate: bool,
	    compilation_info: typing.Optional[CompilationInfo] = None,
	):
	    # "<rows>x<cols>x<element type>" for one matrix; int_or_DYN renders each
	    # dimension.
	    def describe(rows, cols, elem_type):
	        return f"{int_or_DYN(rows)}x{int_or_DYN(cols)}x{elem_type.value}"

	    lhs_desc = describe(shapes.lhs_rows, shapes.lhs_cols, lhs_type)
	    rhs_desc = describe(shapes.rhs_rows, shapes.rhs_cols, rhs_type)
	    acc_desc = describe(shapes.acc_rows, shapes.acc_cols, acc_type)

	    suffix = ""
	    if compilation_info:
	        pipeline = compilation_info.dispatch_lowering_pass_pipeline
	        wrapper = "#iree_gpu.pipeline<"
	        if pipeline.startswith(wrapper):
	            # Strip the #iree_gpu.pipeline<...> wrapper (prefix and final
	            # character) for use in identifiers.
	            pipeline = pipeline[len(wrapper) : -1]
	        suffix = f"_for_{pipeline}"

	    prefix = "matmul_accumulate" if accumulate else "matmul"
	    return f"{prefix}_{lhs_desc}_times_{rhs_desc}_into_{acc_desc}{suffix}"


	# Returns the comma-separated `affine_map`s (LHS, RHS, accumulator) of a matmul
	# over the iteration space (d0, d1, d2) = (M, N, K). The RHS is indexed as
	# (N, K) when transpose_rhs is set and as (K, N) otherwise.
	def _matmul_indexing_maps(transpose_rhs: bool) -> str:
	    rhs_results = "(d1, d2)" if transpose_rhs else "(d2, d1)"
	    return (
	        "affine_map<(d0, d1, d2) -> (d0, d2)>, "
	        f"affine_map<(d0, d1, d2) -> {rhs_results}>, "
	        "affine_map<(d0, d1, d2) -> (d0, d1)>"
	    )


	# Returns the MLIR lines that define `%init_acc`, a `tensor.empty` of type
	# acc_tensor_type. For each dynamic accumulator dimension a `tensor.dim` query
	# is emitted first: rows (M) come from dimension 0 of %lhs, columns (N) from
	# dimension rhs_n_dim of %rhs.
	def _generate_init_acc(
	    lhs_tensor_type: str,
	    rhs_tensor_type: str,
	    acc_tensor_type: str,
	    dynamic_rows: bool,
	    dynamic_cols: bool,
	    rhs_n_dim: int,
	) -> str:
	    index_constants = set()
	    dim_queries = []
	    dynamic_sizes = []
	    if dynamic_rows:
	        index_constants.add(0)
	        dim_queries.append(
	            f" %acc_dim0 = tensor.dim %lhs, %c0 : {lhs_tensor_type}\n"
	        )
	        dynamic_sizes.append("%acc_dim0")
	    if dynamic_cols:
	        index_constants.add(rhs_n_dim)
	        dim_queries.append(
	            f" %acc_dim1 = tensor.dim %rhs, %c{rhs_n_dim} : {rhs_tensor_type}\n"
	        )
	        dynamic_sizes.append("%acc_dim1")
	    # Each index constant is defined once, even if both queries use it.
	    constants = [
	        f" %c{i} = arith.constant {i} : index\n" for i in sorted(index_constants)
	    ]
	    return (
	        "".join(constants + dim_queries)
	        + f" %init_acc = tensor.empty({', '.join(dynamic_sizes)}) : {acc_tensor_type}\n"
	    )


	# Generates a test function in the generated MLIR code.
	# The generated function takes the matmul operands (%lhs, %rhs, and %acc when
	# shape.accumulate is set; otherwise a zero-filled accumulator is created
	# inside the function), performs the matmul with `linalg.matmul` — or with an
	# explicit-body `linalg.generic` when _needs_linalg_generic requires it — and
	# returns the result.
	# When mx_scale_type is set, generation is delegated to generate_function_mx.
	# Returns an MLIRFunction carrying the function's name, signature, import
	# declaration and definition.
	def generate_function(
	    lhs_type: MatrixElemTypeId,
	    rhs_type: MatrixElemTypeId,
	    acc_type: MatrixElemTypeId,
	    mx_scale_type: MatrixElemTypeId,
	    mx_block_size: int,
	    shape: TestShape,
	    transpose_rhs: bool,
	    dynamicities: tuple[Dynamicity, Dynamicity, Dynamicity],
	    compilation_info: Optional[CompilationInfo] = None,
	):
	    if mx_scale_type:
	        # MX matmul currently only supports same-type inputs.
	        assert lhs_type == rhs_type, "MX matmul requires lhs_type == rhs_type"
	        return generate_function_mx(
	            lhs_rhs_type=lhs_type,
	            acc_type=acc_type,
	            mx_scale_type=mx_scale_type,
	            mx_block_size=mx_block_size,
	            shape=shape,
	            transpose_rhs=transpose_rhs,
	            dynamicities=dynamicities,
	            compilation_info=compilation_info,
	        )

	    shapes = generate_shapes(shape, transpose_rhs, dynamicities)
	    func_name = generate_function_name(
	        lhs_type=lhs_type,
	        rhs_type=rhs_type,
	        acc_type=acc_type,
	        shapes=shapes,
	        accumulate=shape.accumulate,
	        compilation_info=compilation_info,
	    )
	    lhs_r = int_or_question_mark(shapes.lhs_rows)
	    lhs_c = int_or_question_mark(shapes.lhs_cols)
	    rhs_r = int_or_question_mark(shapes.rhs_rows)
	    rhs_c = int_or_question_mark(shapes.rhs_cols)
	    acc_r = int_or_question_mark(shapes.acc_rows)
	    acc_c = int_or_question_mark(shapes.acc_cols)

	    # Use signless storage in IR (`ui8` becomes `i8` here) — the unsigned
	    # semantics are expressed by `arith.extui` in the linalg body, which the
	    # encoding system's signedness inference picks up.
	    lhs_tensor_type = f"tensor<{lhs_r}x{lhs_c}x{_storage_type(lhs_type)}>"
	    rhs_tensor_type = f"tensor<{rhs_r}x{rhs_c}x{_storage_type(rhs_type)}>"
	    acc_tensor_type = f"tensor<{acc_r}x{acc_c}x{acc_type.value}>"

	    (
	        compilation_info_string,
	        compilation_info_attr,
	    ) = generate_compilation_info_string_and_attr(compilation_info)

	    args = [("%lhs", lhs_tensor_type), ("%rhs", rhs_tensor_type)]
	    if shape.accumulate:
	        args += [("%acc", acc_tensor_type)]

	    func_definition = compilation_info_string + (
	        f"util.func @{func_name}("
	        + ", ".join([name + ": " + ty for name, ty in args])
	        + f") -> {acc_tensor_type} {{\n"
	    )

	    if not shape.accumulate:
	        # No accumulator argument: create one inside the function and fill it
	        # with zeros of the accumulator element type.
	        literal_zero_for_acc_type = "0.0" if "f" in acc_type.value else "0"
	        func_definition += _generate_init_acc(
	            lhs_tensor_type=lhs_tensor_type,
	            rhs_tensor_type=rhs_tensor_type,
	            acc_tensor_type=acc_tensor_type,
	            dynamic_rows=(acc_r == "?"),
	            dynamic_cols=(acc_c == "?"),
	            # N is dimension 0 of a transposed (NxK) RHS and dimension 1 of a
	            # non-transposed (KxN) RHS.
	            rhs_n_dim=0 if transpose_rhs else 1,
	        )
	        func_definition += (
	            f" %c0_acc_type = arith.constant {literal_zero_for_acc_type}: {acc_type.value}\n"
	            f" %acc = linalg.fill ins(%c0_acc_type : {acc_type.value}) outs(%init_acc : {acc_tensor_type}) -> {acc_tensor_type}\n"
	        )

	    if _needs_linalg_generic(lhs_type, rhs_type):
	        # `linalg.generic` body widens each operand to the accumulator
	        # type. `arith.extui` for unsigned, `arith.extsi` for signed —
	        # the encoding system's signedness inference reads this to route
	        # `ui8` operands to the right MMA intrinsic (e.g. vpdpbusd).
	        # NOTE: the body is built from integer ops only (`arith.extsi` /
	        # `arith.extui`, `arith.muli`, `arith.addi`), so this path is only
	        # meaningful for integer operand and accumulator types.
	        lhs_storage = _storage_type(lhs_type)
	        rhs_storage = _storage_type(rhs_type)
	        acc_t = acc_type.value
	        lhs_ext = "arith.extui" if lhs_type == MatrixElemTypeId.UI8 else "arith.extsi"
	        rhs_ext = "arith.extui" if rhs_type == MatrixElemTypeId.UI8 else "arith.extsi"
	        # Iterators: M, N parallel; K reduction. Maps: (m,k), (k,n)/(n,k),
	        # (m,n) — RHS uses (n,k) when transpose_rhs is set.
	        maps = _matmul_indexing_maps(transpose_rhs)
	        func_definition += (
	            f" %result = linalg.generic {{indexing_maps = [{maps}], "
	            f'iterator_types = ["parallel", "parallel", "reduction"]}} '
	            f"{compilation_info_attr} ins(%lhs, %rhs: {lhs_tensor_type}, "
	            f"{rhs_tensor_type}) outs(%acc: {acc_tensor_type}) {{\n"
	            f" ^bb0(%a: {lhs_storage}, %b: {rhs_storage}, %c: {acc_t}):\n"
	            f" %a_ext = {lhs_ext} %a : {lhs_storage} to {acc_t}\n"
	            f" %b_ext = {rhs_ext} %b : {rhs_storage} to {acc_t}\n"
	            f" %prod = arith.muli %a_ext, %b_ext : {acc_t}\n"
	            f" %sum = arith.addi %c, %prod : {acc_t}\n"
	            f" linalg.yield %sum : {acc_t}\n"
	            f" }} -> {acc_tensor_type}\n"
	            f" util.return %result: {acc_tensor_type}\n"
	            f"}}\n"
	        )
	    else:
	        # The named op only gets explicit indexing maps when the RHS is
	        # transposed; otherwise no `indexing_maps` attribute is emitted.
	        indexing_maps_attr = ""
	        if transpose_rhs:
	            indexing_maps_attr = (
	                f"indexing_maps = [{_matmul_indexing_maps(transpose_rhs)}]"
	            )
	        func_definition += (
	            f" %result = linalg.matmul {indexing_maps_attr} {compilation_info_attr} ins(%lhs, %rhs: {lhs_tensor_type}, {rhs_tensor_type}) outs(%acc: {acc_tensor_type}) -> {acc_tensor_type}\n"
	            f" util.return %result: {acc_tensor_type}\n"
	            f"}}\n"
	        )

	    # The result type is interpolated here (f-string), so the signature ends
	    # with the actual accumulator tensor type.
	    signature = ", ".join(ty for _, ty in args) + f" -> {acc_tensor_type}"
	    import_declaration = (
	        f"util.func private @module.{func_name}("
	        + ", ".join([name + ": !hal.buffer_view" for name, ty in args])
	        + ") -> !hal.buffer_view"
	    )
	    return MLIRFunction(
	        name=func_name,
	        signature=signature,
	        import_declaration=import_declaration,
	        definition=func_definition,
	    )


	# Counter that generate_call (on its non-MX path) appends to the name of each
	# generated call function and then increments.
	call_id = 0


	# Generates a testcase, i.e. a single test function call: the MLIR text of a
	# `util.func` that emits the input matrices via generate_random_matrix, calls
	# the given test function and passes its result to
	# `matmul_test.check_matmul_results`.
	# When mx_scale_type is set, generation is delegated to generate_call_mx.
	# Returns a TestCall holding the function and the generated op text.
	def generate_call(
	    function: MLIRFunction,
	    lhs_type: MatrixElemTypeId,
	    rhs_type: MatrixElemTypeId,
	    acc_type: MatrixElemTypeId,
	    mx_scale_type: MatrixElemTypeId,
	    mx_block_size: int,
	    shape: TestShape,
	    transpose_rhs: bool,
	):
	    if mx_scale_type:
	        return generate_call_mx(
	            function=function,
	            lhs_rhs_type=lhs_type,
	            acc_type=acc_type,
	            mx_scale_type=mx_scale_type,
	            mx_block_size=mx_block_size,
	            shape=shape,
	            transpose_rhs=transpose_rhs,
	        )

	    global call_id
	    # The trailing call_id keeps every generated call function name unique.
	    accumulate_suffix = "_acc" if shape.accumulate else ""
	    func_name = (
	        f"{function.name}_{shape.m}_{shape.k}_{shape.n}"
	        f"{accumulate_suffix}_{call_id}"
	    )
	    call_id += 1

	    description = f"Matmul shape (MxKxN): {shape.m}x{shape.k}x{shape.n}"
	    pieces = [
	        f"util.func @{func_name}() attributes {{\n",
	        f' iree.reflection = {{description = "{description}"}}\n',
	        "} {\n",
	        " %device_index = arith.constant 0 : index\n",
	        " %device = hal.devices.get %device_index : !hal.device\n",
	    ]

	    rhs_shape = [shape.n, shape.k] if transpose_rhs else [shape.k, shape.n]
	    transpose_rhs_flag = 1 if transpose_rhs else 0

	    # The checker always receives lhs and rhs; the call additionally receives
	    # the accumulator when accumulating.
	    check_args = [
	        ("lhs", [shape.m, shape.k], lhs_type),
	        ("rhs", rhs_shape, rhs_type),
	    ]
	    matmul_args = list(check_args)
	    if shape.accumulate:
	        matmul_args.append(("acc", [shape.m, shape.n], acc_type))
	    else:
	        pieces.append(" %acc = util.null : !hal.buffer_view\n")

	    for arg_name, arg_shape, arg_elemtype in matmul_args:
	        pieces.append(generate_random_matrix(arg_name, arg_shape, arg_elemtype))
	        if arg_name == "acc":
	            # TODO(#16168): there's a bug with in-place input->output aliasing
	            # and we work around it here by passing in a unique copy.
	            pieces.append(
	                generate_random_matrix(
	                    "acc_copy", arg_shape, arg_elemtype, increment_seed=False
	                )
	            )

	    # Comma-separated SSA names and (all `!hal.buffer_view`) types of the args.
	    def names_and_types(args_list):
	        names = ", ".join("%" + name for name, _, _ in args_list)
	        types = ", ".join("!hal.buffer_view" for _ in args_list)
	        return names, types

	    matmul_argnames, matmul_argtypes = names_and_types(matmul_args)
	    check_argnames, check_argtypes = names_and_types(check_args)
	    checked_acc = "%acc_copy" if shape.accumulate else "%acc"
	    pieces += [
	        f" %result = util.call @module.{function.name}({matmul_argnames}) : ({matmul_argtypes}) -> !hal.buffer_view\n",
	        f" %m = arith.constant {shape.m} : i64\n",
	        f" %k = arith.constant {shape.k} : i64\n",
	        f" %n = arith.constant {shape.n} : i64\n",
	        f" %transpose_rhs = arith.constant {transpose_rhs_flag} : i32\n",
	        f" util.call @matmul_test.check_matmul_results(%device, %m, %k, %n, %transpose_rhs, {check_argnames}, {checked_acc}, %result) : (!hal.device, i64, i64, i64, i32, {check_argtypes}, !hal.buffer_view, !hal.buffer_view) -> ()\n",
	        " util.return\n",
	        "}\n",
	    ]

	    return TestCall(function=function, op="".join(pieces))