// RUN: iree-compile --split-input-file --iree-hal-target-backends=vmvx \
// RUN:   --output-format=vm-bytecode \
// RUN:   --iree-vm-bytecode-module-output-format=flatbuffer-text %s \
// RUN:   --mlir-print-ir-after=iree-vm-ordinal-allocation 2>&1 | FileCheck %s

// This file has a few test programs that show how to mix `flow` dispatches into
// those created by the `linalg` dispatch region formation: the idea is to use
// any normal IREE input (mhlo/tosa/linalg/etc) on tensors and then also include
// `flow.dispatch` ops calling `stream.executable`s. `flow.executable`s could be
// used too but currently have some ergonomics issues that need to be resolved;
// the improved version of `flow.dispatch` (and `flow.dispatch.workgroups`) will
// be made part of the public `iree` dialect at which time this file will change
// to using that. The `flow`/`stream` dialects are generally not considered
// stable.

// A simple element-wise multiply of two static tensors:
//   %ret0 = %arg0 * %arg1
//
// The host code performs the dispatch with a workload of 4x1x1 - how many
// workgroups that gets distributed across is left to the HAL backend to decide
// based on the target device and how the work is tiled.
//
// The device code in the stream.executable is tiled - but does not need to be:
// the only thing we care about at this level is the bindings and any operands
// that may need to be passed from host->device.

// CHECK-LABEL: vm.module public @e2e
module @e2e {
  // CHECK: vm.rodata private @executable_0_vmvx_bytecode_fb
  stream.executable private @executable_0 {
    stream.executable.export public @dispatch workgroups(%arg0: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @dispatch(%arg0: !stream.binding, %arg1: !stream.binding, %ret0: !stream.binding) {
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4xf32>>
        %2 = stream.binding.subspan %ret0[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<4xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4xf32>> -> tensor<4xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4xf32>> -> tensor<4xf32>
        %5 = tensor.empty() : tensor<4xf32>
        %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %10 = arith.mulf %arg4, %arg5 : f32
          linalg.yield %10 : f32
        } -> tensor<4xf32>
        flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:tensor<4xf32>>
        return
      }
    }
  }
  // CHECK: vm.func private @simple_mul
  func.func @simple_mul(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
    %c4 = arith.constant 4 : index
    // CHECK: vm.call.variadic @hal.command_buffer.dispatch
    %ret0 = flow.dispatch @executable_0::@dispatch[%c4](%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
    return %ret0 : tensor<4xf32>
  }
}  // module

// -----

// The same element-wise multiply but now in-place:
//   %arg0 = %arg0 * %arg1
//
// In-place operations can often introduce false dependencies between dispatches
// and should be avoided at this level in most cases - there's currently no cost
// model for making dispatches into in-place operations but it's something that
// would happen in the stream dialect after scheduling: two dispatches known to
// not be running concurrently and operating on the same resources could be made
// in-place.

// CHECK-LABEL: vm.module public @inplace
module @inplace {
  // CHECK: vm.rodata private @executable_1_vmvx_bytecode_fb
  stream.executable private @executable_1 {
    stream.executable.export public @dispatch workgroups(%arg0: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @dispatch(%arg0: !stream.binding, %arg1: !stream.binding) {
        %c0 = arith.constant 0 : index
        // %arg0 is both consumed and produced: note the readwrite access below.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<4xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<4xf32>> -> tensor<4xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4xf32>> -> tensor<4xf32>
        %5 = tensor.empty() : tensor<4xf32>
        %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) attrs = {name = "mul.1"} {
        ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):
          %10 = arith.mulf %arg4, %arg5 : f32
          linalg.yield %10 : f32
        } -> tensor<4xf32>
        flow.dispatch.tensor.store %6, %0, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<readwrite:tensor<4xf32>>
        return
      }
    }
  }
  // CHECK: vm.func private @simple_mul_inplace
  func.func @simple_mul_inplace(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
    %c4 = arith.constant 4 : index
    // CHECK: vm.call.variadic @hal.command_buffer.dispatch
    // Tied result (`-> %arg0`): the dispatch writes its result into %arg0.
    %ret0 = flow.dispatch @executable_1::@dispatch[%c4](%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> %arg0
    return %ret0 : tensor<4xf32>
  }
}  // module

// -----

// The same element-wise multiply but now with dynamic shapes:
//   %ret0 = %arg0 * %arg1
//
// This shows how the shape dimensions are captured by the dispatch so that the
// host knows the shapes of the tensors and how the dimensions are passed as
// operands to the executable for association. Once we perform the host/device
// split the association allows tensor.dim ops in the device code to query the
// dynamic dimensions without needing to insert new host -> device transfers.
// Note that because of this explicit association the order of the dispatch
// operands doesn't matter as walking the SSA use-def chain up to the
// stream.binding.subspan allows them to be resolved directly.

// CHECK-LABEL: vm.module public @dynamic
module @dynamic {
  // CHECK: vm.rodata private @executable_2_vmvx_bytecode_fb
  stream.executable private @executable_2 {
    stream.executable.export public @dispatch workgroups(%arg0: index) -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg0
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dynamic dims are passed as index operands alongside their bindings so
      // the device code can resolve them without host->device transfers.
      func.func @dispatch(%arg0: !stream.binding, %arg0_dim0: index, %arg1: !stream.binding, %arg1_dim0: index, %ret0: !stream.binding) {
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?xf32>>{%arg0_dim0}
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?xf32>>{%arg1_dim0}
        %2 = stream.binding.subspan %ret0[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%arg0_dim0}
        %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [%arg0_dim0], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%arg0_dim0} -> tensor<?xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [%arg1_dim0], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%arg1_dim0} -> tensor<?xf32>
        %5 = tensor.empty(%arg0_dim0) : tensor<?xf32>
        %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<?xf32>, tensor<?xf32>) outs(%5 : tensor<?xf32>) attrs = {name = "mul.1"} {
        ^bb0(%arg6: f32, %arg7: f32, %arg8: f32):
          %10 = arith.mulf %arg6, %arg7 : f32
          linalg.yield %10 : f32
        } -> tensor<?xf32>
        flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [%arg0_dim0], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%arg0_dim0}
        return
      }
    }
  }
  // CHECK: vm.func private @simple_mul_dynamic
  func.func @simple_mul_dynamic(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
    %c0 = arith.constant 0 : index
    // CHECK: vm.call @hal.buffer_view.dim
    %arg0_dim0 = tensor.dim %arg0, %c0 : tensor<?xf32>
    // CHECK: vm.call @hal.buffer_view.dim
    %arg1_dim0 = tensor.dim %arg1, %c0 : tensor<?xf32>
    // CHECK: vm.call.variadic @hal.command_buffer.dispatch
    %ret0 = flow.dispatch @executable_2::@dispatch[%arg0_dim0](%arg0, %arg0_dim0, %arg1, %arg1_dim0) : (tensor<?xf32>{%arg0_dim0}, index, tensor<?xf32>{%arg1_dim0}, index) -> tensor<?xf32>{%arg0_dim0}
    return %ret0 : tensor<?xf32>
  }
}  // module

// -----

// This shows the same element-wise multiply but without the first level of
// tiling. This will execute in a single workgroup regardless of tensor size
// (though here it's 4 so it wouldn't be distributed anyway).

// CHECK-LABEL: vm.module public @untiled
module @untiled {
  // CHECK: vm.rodata private @executable_3_vmvx_bytecode_fb
  stream.executable private @executable_3 {
    // Untiled: always a single 1x1x1 workgroup regardless of the workload.
    stream.executable.export public @dispatch workgroups(%arg0: index) -> (index, index, index) {
      %c1 = arith.constant 1 : index
      stream.return %c1, %c1, %c1 : index, index, index
    }
    builtin.module {
      func.func @dispatch(%arg0: !stream.binding, %arg1: !stream.binding, %ret0: !stream.binding) {
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4xf32>>
        %2 = stream.binding.subspan %ret0[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<4xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4xf32>> -> tensor<4xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4xf32>> -> tensor<4xf32>
        %5 = tensor.empty() : tensor<4xf32>
        %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<4xf32>, tensor<4xf32>) outs(%5 : tensor<4xf32>) {
        ^bb0(%lhs: f32, %rhs: f32, %out: f32):
          %7 = arith.mulf %lhs, %rhs : f32
          linalg.yield %7 : f32
        } -> tensor<4xf32>
        flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:tensor<4xf32>>
        return
      }
    }
  }
  // CHECK: vm.func private @simple_mul_untiled
  func.func @simple_mul_untiled(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
    %c1 = arith.constant 1 : index
    %ret0 = flow.dispatch @executable_3::@dispatch[%c1](%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
    return %ret0 : tensor<4xf32>
  }
}  // module