// RUN: iree-opt %s

// Codegen
transform.with_pdl_patterns {
^bb0(%arg0: !pdl.operation):
  transform.structured.canonicalized_sequence %arg0 failures(propagate) {
  ^bb1(%variant_op: !pdl.operation):
    // First level of tiling + fusion parallelizes to blocks.
    // At the moment, the mapping to block ids can only happen after
    // bufferization.
    %root = transform.structured.match interface{LinalgOp}
      attributes{iterator_types = ["parallel", "parallel", "parallel"]} in %variant_op
    %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op
    %red = transform.structured.match interface{LinalgOp}
      attributes{iterator_types = ["parallel", "parallel", "reduction"]} in %variant_op
    %not_root = merge_handles %fill, %red
    %foreach_thread, %tiled_generic =
      transform.structured.tile_to_foreach_thread_op %root tile_sizes [1, 4]
    transform.structured.fuse_into_containing_op %not_root into %foreach_thread

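    // Note: tile_to_foreach_thread_op and fuse_into_containing_op consume
    // their operand handles, so the payload ops are re-matched from
    // %variant_op before the second level of tiling below.
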
    // Second level of tiling + fusion parallelizes to threads.
    // Leaving the reduction untiled on threadIdx.x makes it sequential on
    // threadIdx.x. After distribution, predication by if (threadIdx.x == 0)
    // is introduced and opportunities for distributing vector ops across
    // warps appear.
    %fill_linalg = transform.structured.match ops{["linalg.fill"]} in %variant_op
    %reduction_linalg = transform.structured.match ops{["linalg.generic"]}
      attributes{iterator_types = ["parallel", "parallel", "reduction"]} in %variant_op
    %parallel_linalg = transform.structured.match ops{["linalg.generic"]}
      attributes{iterator_types = ["parallel", "parallel", "parallel"]} in %variant_op
    %foreach_thread_reduction, %tiled_reduction_generic =
      transform.structured.tile_to_foreach_thread_op %reduction_linalg tile_sizes [1, 1]
      (mapped to dims [2, 1, 0])
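    // tile_sizes [1, 1] only tiles the two leading parallel dimensions; the
    // reduction dimension is deliberately left untiled so that, as noted
    // above, it stays sequential on threadIdx.x.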
    // TODO: this fusion currently does not happen properly; this is related
    // to the clone behavior when fusing into scf.foreach_thread.
    // Once fixed, we will be able to fuse.
    // Fusion will save us one roundtrip to memory.
    // transform.structured.fuse_into_containing_op %fill_linalg into %foreach_thread_reduction
    transform.structured.tile_to_foreach_thread_op %parallel_linalg num_threads [1, 4, 32]
      (mapped to dims [2, 1, 0])

    // The inability to tile reductions to scf.foreach_thread has two
    // implications:
    //   1. Since no scf.foreach_thread is present, no gpu.barrier is added.
    //      This should be fixed independently: ops that are not nested in an
    //      scf.foreach_thread should have a gpu.barrier. This will later need
    //      to be complemented by a barrier-removal pass.
    //   2. Similarly, the write needs to be predicated by
    //      if (threadIdx.x == 0) to avoid multiple threads updating the
    //      buffer in place once bufferized.
    //
    // Instead, we can vectorize and go to vector SSA values, which sidestep
    // these issues: all threads race to the write, but they all compute the
    // same value.
    //
    // That is still not good enough, because we need to predicate the write
    // in order to enable the parallel reduction on warps.
    %func = transform.structured.match ops{["func.func"]} in %variant_op
    %funcx = transform.iree.apply_patterns %func { rank_reducing }
    transform.structured.vectorize %funcx
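    // After vectorization the computation is carried by vector SSA values,
    // so the in-place update hazard described above no longer applies at
    // this point.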

    // Bufferization is necessary for:
    //   1. lowering scf.foreach_thread to workgroup (block-level parallelism);
    //   2. lowering scf.foreach_thread to gpu (thread-level parallelism);
    //   3. introducing predication (due to 1. and 2.), which enables rewriting
    //      to warp_execute_on_lane_0 and later vector distribution.
    %variant_op_2 = transform.iree.bufferize { target_gpu } %variant_op
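    // From here on the IR operates on buffers; bufferization consumed
    // %variant_op, so subsequent matches use the new handle %variant_op_2.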

    %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_2
    %func_3 = transform.iree.foreach_thread_to_workgroup %func_2
    transform.iree.map_nested_foreach_thread_to_gpu_threads %func_3
      { workgroup_size = [32, 4, 1] }
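    // workgroup_size is [x, y, z]: 32 threads along x, 4 along y, 1 along z.
    // This matches num_threads [1, 4, 32] (mapped to dims [2, 1, 0]) above,
    // which assigns the 32-thread loop to dim 0 (x) and the 4-thread loop to
    // dim 1 (y).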

    %end_func = transform.structured.match ops{["func.func"]} in %variant_op_2
    %end_func_2 = transform.iree.apply_patterns %end_func { rank_reducing }

    // Vector distribution needs to happen on buffers.
    %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_2
    %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
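    // This rewrites the scf.if (threadIdx.x == 0) region produced by the
    // thread mapping into vector.warp_execute_on_lane_0 with warp_size = 32,
    // which warp_distribute then distributes across the lanes of the warp.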
    transform.iree.vector.warp_distribute %end_func_2
  }
}