// Type aliases for the reduction input (8x64) and its per-row result (8).
!in_tensor_t = tensor<8x64xf32>
!out_tensor_t = tensor<8xf32>

// Row-wise sum-reduction of an 8x64 tensor followed by an elementwise sqrt:
//   out[d0] = sqrt(sum_{d1} arg[d0, d1])
// The trailing elementwise linalg.generic exists so codegen can fuse it into
// the reduction (see the CHECK lines below).
func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) {
  // -0.0 is the identity for f32 addition (preserves the sign of -0.0 inputs).
  %cst = arith.constant -0.000000e+00 : f32

  %0 = tensor.empty() : !out_tensor_t
  %1 = linalg.fill ins(%cst : f32) outs(%0 : !out_tensor_t) -> !out_tensor_t
  // Sum over d1 (the "reduction" iterator); d0 stays parallel.
  %5 = linalg.generic {
    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                     affine_map<(d0, d1) -> (d0)>],
    iterator_types = ["parallel", "reduction"]}
    ins(%arg : !in_tensor_t) outs(%1 : !out_tensor_t) {
      ^bb0(%arg3: f32, %arg4: f32):
        %4 = arith.addf %arg3, %arg4 : f32
        linalg.yield %4 : f32
    } -> !out_tensor_t

  // Elementwise sqrt of the 8-element reduction result.
  %6 = tensor.empty() : !out_tensor_t
  %7 = linalg.generic {
    indexing_maps = [affine_map<(d0) -> (d0)>,
                     affine_map<(d0) -> (d0)>],
    iterator_types = ["parallel"]}
    ins(%5 : !out_tensor_t) outs(%6 : !out_tensor_t) {
      ^bb0(%arg3: f32, %arg4: f32):
        %4 = math.sqrt %arg3 : f32
        linalg.yield %4 : f32
    } -> !out_tensor_t
  return %7 : !out_tensor_t
}
31
// RUN: iree-opt %s --iree-hal-target-backends=cuda \
// RUN: --iree-abi-transformation-pipeline \
// RUN: --iree-flow-transformation-pipeline \
// RUN: --iree-stream-transformation-pipeline \
// RUN: --iree-hal-configuration-pipeline | \
// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmgpu-lower-executable-target)))' \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
// RUN: --iree-codegen-llvmgpu-use-transform-dialect=%p/reduction_eltwise_codegen_spec.mlir | \
// RUN: FileCheck %s --check-prefix=CHECK

// RUN: iree-compile %s --iree-hal-target-backends=cuda \
// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
// RUN: --iree-codegen-llvmgpu-use-transform-dialect=%p/reduction_eltwise_codegen_spec.mlir | \
// RUN: iree-run-module --function=reduce --device=cuda --input="8x64xf32=1" |\
// RUN: FileCheck %s --check-prefix=EXEC

/// Note: the current --iree-codegen-llvmgpu-enable-transform-dialect-jit only works for exactly this reduction atm.
// RUN: iree-compile %s --iree-hal-target-backends=cuda | \
// RUN: iree-run-module --function=reduce --device=cuda --input="8x64xf32=1" |\
// RUN: FileCheck %s --check-prefix=EXEC
52
  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
  // CHECK-DAG: %[[workgroup_id_x:.*]] = hal.interface.workgroup.id[0] : index
  // CHECK-DAG: %[[SHMEM_ALLOC:.*]] = memref.alloc() {alignment = 64 : i64} : memref<1x2xf32, #gpu.address_space<workgroup>>
  // CHECK-DAG: %[[TIDX:.]] = gpu.thread_id x
  // CHECK-DAG: %[[TIDY:.]] = gpu.thread_id y
  // CHECK-DAG: %[[CONDXIS0:.*]] = arith.cmpi eq, %[[TIDX]], %[[C0]] : index

  // Distributed reduction: everyone loads then 5 xor + addf expected
  // CHECK: vector.transfer_read %{{.*}}[%[[workgroup_id_x]], %[[TIDY]], %[[TIDX]]]
  // CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf

  // CHECK: %[[RES:.*]] = arith.addf %{{.*}}

  // CHECK: %[[RES_VEC:.*]] = vector.broadcast %[[RES]] : f32 to vector<f32>
  // CHECK: scf.if %[[CONDXIS0]]
  // CHECK: vector.transfer_write %[[RES_VEC]], %[[SHMEM_ALLOC]][%[[C0]], %[[TIDY]]]
  // CHECK: gpu.barrier

  // Last part is not distributed atm and is only run by threadIdx.x == 0 and threadIdx.y == 0.
  // It should contain the fused elementwise operation.
  // CHECK: %[[CONDYIS0:.*]] = arith.cmpi ult, %[[TIDY]], %[[C1]] : index
  // TODO: cond eq 0 and cond ult 1 do not CSE atm.
  // CHECK: %[[CONXANDYARE0:.*]] = arith.andi %{{.*}}, %[[CONDYIS0]] : i1
  // CHECK: scf.if %[[CONXANDYARE0]] {
  // CHECK: vector.transfer_read
  // CHECK: vector.reduction <add>
  // CHECK: math.sqrt
  // CHECK: vector.transfer_write
  // CHECK: gpu.barrier
  // CHECK: memref.dealloc %[[SHMEM_ALLOC]] : memref<1x2xf32, #gpu.address_space<workgroup>>


// EXEC: result[0]: hal.buffer_view
// EXEC-NEXT: 8xf32=8 8 8 8 8 8 8 8