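// Static 3x5 * 5x3 matmul accumulating into a 3x3 init tensor. This is the
// payload IR that the transform-dialect dispatch and codegen specs referenced
// by the RUN lines below are applied to.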
!A_size = tensor<3x5xf32>
!B_size = tensor<5x3xf32>
!C_size = tensor<3x3xf32>
func.func @matmul_static(
%A : !A_size, %B : !B_size, %C : !C_size) -> !C_size {
%0 = linalg.matmul ins(%A, %B : !A_size, !B_size)
outs(%C : !C_size) -> !C_size
return %0 : !C_size
}
// RUN: iree-opt %s --iree-hal-target-backends=llvm-cpu \
// RUN: --iree-abi-transformation-pipeline \
// RUN: --iree-flow-transformation-pipeline \
// RUN: --iree-flow-dispatch-use-transform-dialect=%p/matmul_dispatch_spec.mlir | \
// RUN: FileCheck %s --check-prefixes=DISPATCH
// TODO: make this test drop transform dialect usage at the flow level and use:
// --iree-flow-transformation-pipeline --iree-flow-convert-region-to-workgroups
// At the moment, the third flow.dispatch.tensor.load shows as readonly instead of readwrite.
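// The DISPATCH checks below verify that dispatch region formation driven by
// matmul_dispatch_spec.mlir produces a dispatch containing the whole matmul.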
// DISPATCH: flow.executable private @matmul_static_dispatch_0 {
// DISPATCH: flow.executable.export public @matmul_static_dispatch_0_matmul_3x3x5
// DISPATCH: builtin.module {
// DISPATCH: func.func @matmul_static_dispatch_0_matmul_3x3x5
// DISPATCH: flow.dispatch.tensor.load {{.*}}, offsets = [0, 0], sizes = [3, 5], strides = [1, 1] : !flow.dispatch.tensor<readonly:3x5xf32> -> tensor<3x5xf32>
// DISPATCH: flow.dispatch.tensor.load {{.*}}, offsets = [0, 0], sizes = [5, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:5x3xf32> -> tensor<5x3xf32>
// DISPATCH: flow.dispatch.tensor.load {{.*}}, offsets = [0, 0], sizes = [3, 3], strides = [1, 1] : !flow.dispatch.tensor<readwrite:3x3xf32> -> tensor<3x3xf32>
// DISPATCH: linalg.matmul ins({{.*}} : tensor<3x5xf32>, tensor<5x3xf32>) outs({{.*}} : tensor<3x3xf32>) -> tensor<3x3xf32>
// DISPATCH: flow.dispatch.tensor.store {{.*}} offsets = [0, 0], sizes = [3, 3], strides = [1, 1] : tensor<3x3xf32> -> !flow.dispatch.tensor<readwrite:3x3xf32>
// DISPATCH: return
// RUN: iree-opt %s --iree-hal-target-backends=llvm-cpu \
// RUN: --iree-abi-transformation-pipeline \
// RUN: --iree-flow-transformation-pipeline \
// RUN: --iree-flow-dispatch-use-transform-dialect=%p/matmul_dispatch_spec.mlir \
// RUN: --iree-stream-transformation-pipeline \
// RUN: --iree-hal-configuration-pipeline | \
// RUN: iree-opt --pass-pipeline='hal.executable(hal.executable.variant(iree-llvmcpu-lower-executable-target))' \
// RUN: --iree-codegen-llvmcpu-use-transform-dialect=%p/matmul_codegen_spec.mlir | \
// RUN: FileCheck %s --check-prefixes=CODEGEN
// CODEGEN: hal.executable private @matmul_static_dispatch_0 {
// CODEGEN: hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
//
// The signature of the hal.executable.export region is subject to conventions
// at the flow level. These conventions are materialized in IR, e.g. into
// stream.cmd.dispatch, before codegen gets invoked.
// As a consequence, the tile_size/num_threads/workgroup_count passed to
// transform.tile_to_foreach_thread must be consistent with these conventions.
// For now we follow our own convention: sizes are static and the only bbArg
// present is !hal.device.
//
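// A minimal, hypothetical sketch of what a codegen spec along the lines of
// matmul_codegen_spec.mlir could contain under this convention (the actual
// spec lives next to this test; exact op spellings may differ):
//
//   transform.sequence failures(propagate) {
//   ^bb1(%variant_op: !pdl.operation):
//     %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op
//     // Tile to 2 threads along the parallel dimension; this surfaces as the
//     // `arith.constant 2 : index` workgroup count checked below.
//     %foreach_thread, %tiled =
//       transform.structured.tile_to_foreach_thread_op %matmul num_threads [2]
//   }
//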
// CODEGEN: hal.executable.export public @matmul_static_dispatch_0_matmul_3x3x5 ordinal(0) layout(#{{.*}}) attributes {translation_info = #translation} {
// CODEGEN: ^bb0(%{{.*}}: !hal.device):
// CODEGEN: arith.constant 2 : index
// CODEGEN: arith.constant 1 : index
// CODEGEN: hal.return %{{.*}}, %{{.*}}, %{{.*}} : index, index, index
// CODEGEN: }
// CODEGEN: builtin.module {
// CODEGEN: func.func @matmul_static_dispatch_0_matmul_3x3x5() {
// CODEGEN: arith.constant 0 : index
// CODEGEN: hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset({{.*}}) alignment(64) : memref<3x5xf32>
// CODEGEN: memref.assume_alignment %{{.*}}, 64 : memref<3x5xf32>
// CODEGEN: hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset({{.*}}) alignment(64) : memref<5x3xf32>
// CODEGEN: memref.assume_alignment %{{.*}}, 64 : memref<5x3xf32>
// CODEGEN: hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset({{.*}}) alignment(64) : memref<3x3xf32>
// CODEGEN: memref.assume_alignment %{{.*}}, 64 : memref<3x3xf32>
// CODEGEN: %[[workgroup_id_x:.*]] = hal.interface.workgroup.id[0] : index
// CODEGEN: affine.apply {{.*}}()[%workgroup_id_x]
// CODEGEN: memref.subview %{{.*}}[%{{.*}}, 0] [%{{.*}}, 5] [1, 1] : memref<3x5xf32> to memref<?x5xf32, #{{.*}}>
// CODEGEN: memref.subview %{{.*}}[%{{.*}}, 0] [%{{.*}}, 3] [1, 1] : memref<3x3xf32> to memref<?x3xf32, #{{.*}}>
// CODEGEN: linalg.matmul ins(%{{.*}}, %{{.*}} : memref<?x5xf32, #map3>, memref<5x3xf32>) outs(%{{.*}} : memref<?x3xf32, #{{.*}}>)
// RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu \
// RUN: --iree-flow-dispatch-use-transform-dialect=%p/matmul_dispatch_spec.mlir \
// RUN: --iree-codegen-llvmcpu-use-transform-dialect=%p/matmul_codegen_spec.mlir | \
// RUN: iree-run-module --entry_function=matmul_static \
// RUN: --function_input="3x5xf32=1 1 1 1 1 1 1 1 1 1 1 1 1 1 1" \
// RUN: --function_input="5x3xf32=1 1 1 1 1 1 1 1 1 1 1 1 1 1 1" \
// RUN: --function_input="3x3xf32=0 0 0 0 0 0 0 0 0"| \
// RUN: FileCheck %s --check-prefixes=EXEC
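// Both input operands are filled with 1s and the accumulator with 0s, so each
// output element is a dot product of five 1s, i.e. 5.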
// EXEC: 3x3xf32=[5 5 5][5 5 5][5 5 5]
// RUN: iree-compile --iree-hal-target-backends=llvm-cpu \
// RUN: --iree-flow-dispatch-use-transform-dialect=%p/matmul_tiled_dispatch_spec.mlir \
// RUN: --iree-flow-export-benchmark-funcs %s | \
// RUN: iree-benchmark-module --device=local-task | \
// RUN: FileCheck %s --check-prefixes=BENCHMARK-MODULE
// When running iree-benchmark-module, we only check that the exported function exists.
// BENCHMARK-MODULE: matmul_static