// tests/e2e/encoding/encoding.mlir - 3p/openxla/iree - Git at Google

 //===----------------------------------------------------------------------===//
 // Utility Methods
 //===----------------------------------------------------------------------===//

 // Produces a dynamically shaped 1-D f32 tensor of length %height in which the
 // element at position i holds i (converted index -> i32 -> f32).
 func.func private @generate_1D_source_f32(%height : index) -> tensor<?xf32> {
   %empty = tensor.empty(%height) : tensor<?xf32>
   %iota = linalg.generic {
       indexing_maps = [affine_map<(d0) -> (d0)>],
       iterator_types = ["parallel"]}
       outs(%empty : tensor<?xf32>) {
     ^bb0(%unused : f32):
       %pos = linalg.index 0 : index
       %pos_i32 = arith.index_cast %pos : index to i32
       %pos_f32 = arith.sitofp %pos_i32 : i32 to f32
       linalg.yield %pos_f32 : f32
   } -> tensor<?xf32>
   // The barrier keeps the generated input from being fused with the ops
   // under test.
   %opaque = util.optimization_barrier %iota : tensor<?xf32>
   // Re-attach the dynamic extent to the value coming out of the barrier.
   %shaped = flow.tensor.tie_shape %opaque : tensor<?xf32>{%height}
   return %shaped : tensor<?xf32>
 }

 // Produces a dynamically shaped %height x %width f32 tensor in which element
 // (r, c) holds its row-major linear index r * %width + c
 // (converted index -> i32 -> f32).
 func.func private @generate_2D_source_f32(%height : index, %width : index) -> tensor<?x?xf32> {
   %empty = tensor.empty(%height, %width) : tensor<?x?xf32>
   %iota = linalg.generic {
       indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
       iterator_types = ["parallel", "parallel"]}
       outs(%empty : tensor<?x?xf32>) {
     ^bb0(%unused : f32):
       %row = linalg.index 0 : index
       %col = linalg.index 1 : index
       %row_offset = arith.muli %row, %width : index
       %flat = arith.addi %col, %row_offset : index
       %flat_i32 = arith.index_cast %flat : index to i32
       %flat_f32 = arith.sitofp %flat_i32 : i32 to f32
       linalg.yield %flat_f32 : f32
   } -> tensor<?x?xf32>
   // The barrier keeps the generated input from being fused with the ops
   // under test.
   %opaque = util.optimization_barrier %iota : tensor<?x?xf32>
   // Re-attach the dynamic extents to the value coming out of the barrier.
   %shaped = flow.tensor.tie_shape %opaque : tensor<?x?xf32>{%height, %width}
   return %shaped : tensor<?x?xf32>
 }

 // Produces a dynamically shaped %height x %width x %depth f32 tensor in which
 // element (h, w, d) holds its row-major linear index
 // h * (%width * %depth) + w * %depth + d (converted index -> i32 -> f32).
 func.func private @generate_3D_source_f32(%height : index, %width : index, %depth : index) -> tensor<?x?x?xf32> {
   %empty = tensor.empty(%height, %width, %depth) : tensor<?x?x?xf32>
   // Stride of the outermost dimension, computed once outside the generic.
   %plane = arith.muli %width, %depth : index
   %iota = linalg.generic {
       indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
       iterator_types = ["parallel", "parallel", "parallel"]}
       outs(%empty : tensor<?x?x?xf32>) {
     ^bb0(%unused : f32):
       %i = linalg.index 0 : index
       %j = linalg.index 1 : index
       %k = linalg.index 2 : index
       %i_offset = arith.muli %i, %plane : index
       %j_offset = arith.muli %j, %depth : index
       %jk_offset = arith.addi %j_offset, %k : index
       %flat = arith.addi %i_offset, %jk_offset : index
       %flat_i32 = arith.index_cast %flat : index to i32
       %flat_f32 = arith.sitofp %flat_i32 : i32 to f32
       linalg.yield %flat_f32 : f32
   } -> tensor<?x?x?xf32>
   // The barrier keeps the generated input from being fused with the ops
   // under test.
   %opaque = util.optimization_barrier %iota : tensor<?x?x?xf32>
   // Re-attach the dynamic extents to the value coming out of the barrier.
   %shaped = flow.tensor.tie_shape %opaque : tensor<?x?x?xf32>{%height, %width, %depth}
   return %shaped : tensor<?x?x?xf32>
 }

 //===----------------------------------------------------------------------===//
 // Elementwise materialization tests
 //===----------------------------------------------------------------------===//

 // Three maps over the iteration space (d0, d1, d2); they are passed, in this
 // order, as the user_indexing_maps of the encodings below.
 #map = affine_map<(d0, d1, d2) -> (d0, d2)>
 #map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
 #map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
 // Map over a 2-D iteration space that keeps only d1; it is listed after #map
 // in the broadcast encoding below.
 #bcast_map_1d = affine_map<(d0, d1) -> (d1)>
 // Encoding for operand 0 of a matmul with element types [f32, f32, f32].
 #encoding_f32f32f32_lhs = #iree_encoding.encoding<operand_index = 0, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>
 // Same as #encoding_f32f32f32_lhs except that the first user_indexing_maps
 // entry is the list [#map, #bcast_map_1d].
 #encoding_f32f32f32_lhs_bcast = #iree_encoding.encoding<operand_index = 0, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [[#map, #bcast_map_1d], #map1, #map2]>

 // Checks that an elementwise add whose inputs and output all carry
 // #encoding_f32f32f32_lhs yields, after unset_encoding, the same 129x255
 // values as the plain add of the unencoded source with itself.
 func.func @elementwise() {
   %height = arith.constant 129 : index
   %width = arith.constant 255 : index
   %0 = call @generate_2D_source_f32(%height, %width) : (index, index) -> tensor<?x?xf32>
   %source = tensor.cast %0 : tensor<?x?xf32> to tensor<129x255xf32>

   %1 = iree_encoding.set_encoding %source : tensor<129x255xf32> -> tensor<129x255xf32, #encoding_f32f32f32_lhs>
   %2 = tensor.empty() : tensor<129x255xf32, #encoding_f32f32f32_lhs>
   %3 = linalg.generic {
       indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                        affine_map<(d0, d1) -> (d0, d1)>,
                        affine_map<(d0, d1) -> (d0, d1)>],
       iterator_types = ["parallel", "parallel"]}
       ins(%1, %1 : tensor<129x255xf32, #encoding_f32f32f32_lhs>, tensor<129x255xf32, #encoding_f32f32f32_lhs>)
       outs(%2 : tensor<129x255xf32, #encoding_f32f32f32_lhs>) {
   ^bb0(%in: f32, %in_0: f32, %out: f32):
     %4 = arith.addf %in, %in_0 : f32
     linalg.yield %4 : f32
   } -> tensor<129x255xf32, #encoding_f32f32f32_lhs>
   // Route the encoded result through the barrier so the unset_encoding below
   // consumes the barrier's output rather than the generic's result directly
   // (same pattern as %result_barrier in @unset_encoding_set_encoding_fusion).
   %barrier = util.optimization_barrier %3 : tensor<129x255xf32, #encoding_f32f32f32_lhs>
   %5 = iree_encoding.unset_encoding %barrier : tensor<129x255xf32, #encoding_f32f32f32_lhs> -> tensor<129x255xf32>

   // Reference result computed without any encoding.
   %expected = arith.addf %source, %source : tensor<129x255xf32>
   check.expect_almost_eq(%5, %expected) : tensor<129x255xf32>
   return
 }

 // Checks an encoded elementwise add of a 128x64 tensor with a 64-element
 // tensor read through the (d0, d1) -> (d1) map. The 2-D operand and the result
 // carry #encoding_f32f32f32_lhs, the 1-D operand carries
 // #encoding_f32f32f32_lhs_bcast. The result after unset_encoding is compared
 // against the same generic run on the unencoded tensors.
 func.func @elementwise_with_broadcast_2d() {
   %height = arith.constant 128 : index
   %width = arith.constant 64 : index
   %0 = call @generate_2D_source_f32(%height, %width) : (index, index) -> tensor<?x?xf32>
   %source = tensor.cast %0 : tensor<?x?xf32> to tensor<128x64xf32>
   %1 = call @generate_1D_source_f32(%width) : (index) -> tensor<?xf32>
   %bcast = tensor.cast %1 : tensor<?xf32> to tensor<64xf32>

   %2 = iree_encoding.set_encoding %source : tensor<128x64xf32> -> tensor<128x64xf32, #encoding_f32f32f32_lhs>
   %3 = iree_encoding.set_encoding %bcast : tensor<64xf32> -> tensor<64xf32, #encoding_f32f32f32_lhs_bcast>
   %4 = tensor.empty() : tensor<128x64xf32, #encoding_f32f32f32_lhs>
   %5 = linalg.generic {
       indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                        affine_map<(d0, d1) -> (d1)>,
                        affine_map<(d0, d1) -> (d0, d1)>],
       iterator_types = ["parallel", "parallel"]}
       ins(%2, %3 : tensor<128x64xf32, #encoding_f32f32f32_lhs>, tensor<64xf32, #encoding_f32f32f32_lhs_bcast>)
       outs(%4 : tensor<128x64xf32, #encoding_f32f32f32_lhs>) {
   ^bb0(%in: f32, %b: f32, %out: f32):
     %sum = arith.addf %in, %b : f32
     linalg.yield %sum : f32
   } -> tensor<128x64xf32, #encoding_f32f32f32_lhs>
   // Route the encoded result through the barrier so the unset_encoding below
   // consumes the barrier's output rather than the generic's result directly
   // (same pattern as %result_barrier in @unset_encoding_set_encoding_fusion).
   %barrier = util.optimization_barrier %5 : tensor<128x64xf32, #encoding_f32f32f32_lhs>
   %6 = iree_encoding.unset_encoding %barrier : tensor<128x64xf32, #encoding_f32f32f32_lhs> -> tensor<128x64xf32>

   // Reference result: the same broadcast add on the unencoded tensors.
   %init_expected = tensor.empty() : tensor<128x64xf32>
   %expected = linalg.generic {
       indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                        affine_map<(d0, d1) -> (d1)>,
                        affine_map<(d0, d1) -> (d0, d1)>],
       iterator_types = ["parallel", "parallel"]}
       ins(%source, %bcast : tensor<128x64xf32>, tensor<64xf32>)
       outs(%init_expected : tensor<128x64xf32>) {
   ^bb0(%in: f32, %b: f32, %out: f32):
     %expected_sum = arith.addf %in, %b : f32
     linalg.yield %expected_sum : f32
   } -> tensor<128x64xf32>
   check.expect_almost_eq(%6, %expected) : tensor<128x64xf32>
   return
 }

 // Three maps over the iteration space (d0, d1, d2, d3); every one of them
 // keeps d0. They are passed, in this order, as the user_indexing_maps of the
 // 3-D encodings below.
 #map3 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
 #map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
 #map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
 // Map over a 3-D iteration space that drops d1; it is listed after #map3 in
 // the broadcast encoding below.
 #bcast_map_2d = affine_map<(d0, d1, d2) -> (d0, d2)>
 // Encoding for operand 0 of a matmul with element types [f32, f32, f32].
 #encoding_3d_f32f32f32_lhs = #iree_encoding.encoding<operand_index = 0, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [#map3, #map4, #map5]>
 // Same as #encoding_3d_f32f32f32_lhs except that the first user_indexing_maps
 // entry is the list [#map3, #bcast_map_2d].
 #encoding_3d_f32f32f32_lhs_bcast = #iree_encoding.encoding<operand_index = 0, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [[#map3, #bcast_map_2d], #map4, #map5]>
 // Checks an encoded elementwise add of a 2x128x64 tensor with a 2x64 tensor
 // read through the (d0, d1, d2) -> (d0, d2) map. The 3-D operand and the
 // result carry #encoding_3d_f32f32f32_lhs, the 2-D operand carries
 // #encoding_3d_f32f32f32_lhs_bcast. The result after unset_encoding is
 // compared against the same generic run on the unencoded tensors.
 func.func @elementwise_with_broadcast_3d() {
   %height = arith.constant 2 : index
   %width = arith.constant 128 : index
   %depth = arith.constant 64 : index
   %0 = call @generate_3D_source_f32(%height, %width, %depth) : (index, index, index) -> tensor<?x?x?xf32>
   %source = tensor.cast %0 : tensor<?x?x?xf32> to tensor<2x128x64xf32>
   %1 = call @generate_2D_source_f32(%height, %depth) : (index, index) -> tensor<?x?xf32>
   %bcast = tensor.cast %1 : tensor<?x?xf32> to tensor<2x64xf32>

   %2 = iree_encoding.set_encoding %source : tensor<2x128x64xf32> -> tensor<2x128x64xf32, #encoding_3d_f32f32f32_lhs>
   %3 = iree_encoding.set_encoding %bcast : tensor<2x64xf32> -> tensor<2x64xf32, #encoding_3d_f32f32f32_lhs_bcast>
   %4 = tensor.empty() : tensor<2x128x64xf32, #encoding_3d_f32f32f32_lhs>
   %5 = linalg.generic {
       indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
                        affine_map<(d0, d1, d2) -> (d0, d2)>,
                        affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
       iterator_types = ["parallel", "parallel", "parallel"]}
       ins(%2, %3 : tensor<2x128x64xf32, #encoding_3d_f32f32f32_lhs>, tensor<2x64xf32, #encoding_3d_f32f32f32_lhs_bcast>)
       outs(%4 : tensor<2x128x64xf32, #encoding_3d_f32f32f32_lhs>) {
   ^bb0(%in: f32, %b: f32, %out: f32):
     %sum = arith.addf %in, %b : f32
     linalg.yield %sum : f32
   } -> tensor<2x128x64xf32, #encoding_3d_f32f32f32_lhs>
   // Route the encoded result through the barrier so the unset_encoding below
   // consumes the barrier's output rather than the generic's result directly
   // (same pattern as %result_barrier in @unset_encoding_set_encoding_fusion).
   %barrier = util.optimization_barrier %5 : tensor<2x128x64xf32, #encoding_3d_f32f32f32_lhs>
   %6 = iree_encoding.unset_encoding %barrier : tensor<2x128x64xf32, #encoding_3d_f32f32f32_lhs> -> tensor<2x128x64xf32>

   // Reference result: the same broadcast add on the unencoded tensors.
   %init_expected = tensor.empty() : tensor<2x128x64xf32>
   %expected = linalg.generic {
       indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
                        affine_map<(d0, d1, d2) -> (d0, d2)>,
                        affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
       iterator_types = ["parallel", "parallel", "parallel"]}
       ins(%source, %bcast : tensor<2x128x64xf32>, tensor<2x64xf32>)
       outs(%init_expected : tensor<2x128x64xf32>) {
   ^bb0(%in: f32, %b: f32, %out: f32):
     %expected_sum = arith.addf %in, %b : f32
     linalg.yield %expected_sum : f32
   } -> tensor<2x128x64xf32>
   check.expect_almost_eq(%6, %expected) : tensor<2x128x64xf32>
   return
 }

 // Two encodings that differ only in operand_index (2 vs. 0); both reuse
 // #map, #map1, #map2 as user_indexing_maps with element types [f32, f32, f32].
 #fusion_encoding_f32f32f32_result = #iree_encoding.encoding<operand_index = 2, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>
 #fusion_encoding_f32f32f32_lhs = #iree_encoding.encoding<operand_index = 0, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>
 // Places unset_encoding -> elementwise (+1.0) -> set_encoding inside a single
 // flow.dispatch.region, converting a 129x255 tensor from
 // #fusion_encoding_f32f32f32_result to #fusion_encoding_f32f32f32_lhs, and
 // checks the decoded output against source + 1.0 computed without encodings.
 func.func @unset_encoding_set_encoding_fusion() {
   %height = arith.constant 129 : index
   %width = arith.constant 255 : index
   %cst = arith.constant 1.0 : f32
   %0 = call @generate_2D_source_f32(%height, %width) : (index, index) -> tensor<?x?xf32>
   %source = tensor.cast %0 : tensor<?x?xf32> to tensor<129x255xf32>

   // Encode the input outside the region; the barrier result is what the
   // region reads.
   %1 = iree_encoding.set_encoding %source : tensor<129x255xf32> -> tensor<129x255xf32, #fusion_encoding_f32f32f32_result>
   %src_barrier = util.optimization_barrier %1 : tensor<129x255xf32, #fusion_encoding_f32f32f32_result>

   %dispatch = flow.dispatch.region -> (tensor<129x255xf32, #fusion_encoding_f32f32f32_lhs>) {
     %4 = iree_encoding.unset_encoding %src_barrier : tensor<129x255xf32, #fusion_encoding_f32f32f32_result> -> tensor<129x255xf32>
     %5 = tensor.empty() : tensor<129x255xf32>
     %6 = linalg.generic {
         indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                         affine_map<(d0, d1) -> (d0, d1)>],
         iterator_types = ["parallel", "parallel"]}
         ins(%4 : tensor<129x255xf32>) outs(%5 : tensor<129x255xf32>) {
     ^bb0(%in: f32, %out: f32):
         %8 = arith.addf %in, %cst : f32
         linalg.yield %8 : f32
     } -> tensor<129x255xf32>
     %7 = iree_encoding.set_encoding %6 : tensor<129x255xf32> -> tensor<129x255xf32, #fusion_encoding_f32f32f32_lhs>
     flow.return %7 : tensor<129x255xf32, #fusion_encoding_f32f32f32_lhs>
   }

   // Decode the region's result from the barrier's output.
   %result_barrier = util.optimization_barrier %dispatch : tensor<129x255xf32, #fusion_encoding_f32f32f32_lhs>
   %2 = iree_encoding.unset_encoding %result_barrier : tensor<129x255xf32, #fusion_encoding_f32f32f32_lhs> -> tensor<129x255xf32>

   // Reference result: source + 1.0 on the unencoded tensor.
   %3 = tensor.empty() : tensor<129x255xf32>
   %expected = linalg.generic {
       indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                       affine_map<(d0, d1) -> (d0, d1)>],
       iterator_types = ["parallel", "parallel"]}
       ins(%source : tensor<129x255xf32>) outs(%3 : tensor<129x255xf32>) {
   ^bb0(%in: f32, %out: f32):
       %expected_sum = arith.addf %in, %cst : f32
       linalg.yield %expected_sum : f32
   } -> tensor<129x255xf32>
   check.expect_almost_eq(%2, %expected) : tensor<129x255xf32>
   return
 }
	//===----------------------------------------------------------------------===//
	// Utility Methods
	//===----------------------------------------------------------------------===//

	// Produces a dynamically shaped 1-D f32 tensor of length %height in which the
	// element at position i holds i (converted index -> i32 -> f32).
	func.func private @generate_1D_source_f32(%height : index) -> tensor<?xf32> {
	  %empty = tensor.empty(%height) : tensor<?xf32>
	  %iota = linalg.generic {
	      indexing_maps = [affine_map<(d0) -> (d0)>],
	      iterator_types = ["parallel"]}
	      outs(%empty : tensor<?xf32>) {
	    ^bb0(%unused : f32):
	      %pos = linalg.index 0 : index
	      %pos_i32 = arith.index_cast %pos : index to i32
	      %pos_f32 = arith.sitofp %pos_i32 : i32 to f32
	      linalg.yield %pos_f32 : f32
	  } -> tensor<?xf32>
	  // The barrier keeps the generated input from being fused with the ops
	  // under test.
	  %opaque = util.optimization_barrier %iota : tensor<?xf32>
	  // Re-attach the dynamic extent to the value coming out of the barrier.
	  %shaped = flow.tensor.tie_shape %opaque : tensor<?xf32>{%height}
	  return %shaped : tensor<?xf32>
	}

	// Produces a dynamically shaped %height x %width f32 tensor in which element
	// (r, c) holds its row-major linear index r * %width + c
	// (converted index -> i32 -> f32).
	func.func private @generate_2D_source_f32(%height : index, %width : index) -> tensor<?x?xf32> {
	  %empty = tensor.empty(%height, %width) : tensor<?x?xf32>
	  %iota = linalg.generic {
	      indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
	      iterator_types = ["parallel", "parallel"]}
	      outs(%empty : tensor<?x?xf32>) {
	    ^bb0(%unused : f32):
	      %row = linalg.index 0 : index
	      %col = linalg.index 1 : index
	      %row_offset = arith.muli %row, %width : index
	      %flat = arith.addi %col, %row_offset : index
	      %flat_i32 = arith.index_cast %flat : index to i32
	      %flat_f32 = arith.sitofp %flat_i32 : i32 to f32
	      linalg.yield %flat_f32 : f32
	  } -> tensor<?x?xf32>
	  // The barrier keeps the generated input from being fused with the ops
	  // under test.
	  %opaque = util.optimization_barrier %iota : tensor<?x?xf32>
	  // Re-attach the dynamic extents to the value coming out of the barrier.
	  %shaped = flow.tensor.tie_shape %opaque : tensor<?x?xf32>{%height, %width}
	  return %shaped : tensor<?x?xf32>
	}

	// Produces a dynamically shaped %height x %width x %depth f32 tensor in which
	// element (h, w, d) holds its row-major linear index
	// h * (%width * %depth) + w * %depth + d (converted index -> i32 -> f32).
	func.func private @generate_3D_source_f32(%height : index, %width : index, %depth : index) -> tensor<?x?x?xf32> {
	  %empty = tensor.empty(%height, %width, %depth) : tensor<?x?x?xf32>
	  // Stride of the outermost dimension, computed once outside the generic.
	  %plane = arith.muli %width, %depth : index
	  %iota = linalg.generic {
	      indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
	      iterator_types = ["parallel", "parallel", "parallel"]}
	      outs(%empty : tensor<?x?x?xf32>) {
	    ^bb0(%unused : f32):
	      %i = linalg.index 0 : index
	      %j = linalg.index 1 : index
	      %k = linalg.index 2 : index
	      %i_offset = arith.muli %i, %plane : index
	      %j_offset = arith.muli %j, %depth : index
	      %jk_offset = arith.addi %j_offset, %k : index
	      %flat = arith.addi %i_offset, %jk_offset : index
	      %flat_i32 = arith.index_cast %flat : index to i32
	      %flat_f32 = arith.sitofp %flat_i32 : i32 to f32
	      linalg.yield %flat_f32 : f32
	  } -> tensor<?x?x?xf32>
	  // The barrier keeps the generated input from being fused with the ops
	  // under test.
	  %opaque = util.optimization_barrier %iota : tensor<?x?x?xf32>
	  // Re-attach the dynamic extents to the value coming out of the barrier.
	  %shaped = flow.tensor.tie_shape %opaque : tensor<?x?x?xf32>{%height, %width, %depth}
	  return %shaped : tensor<?x?x?xf32>
	}

	//===----------------------------------------------------------------------===//
	// Elementwise materialization tests
	//===----------------------------------------------------------------------===//

	// Three maps over the iteration space (d0, d1, d2); they are passed, in this
	// order, as the user_indexing_maps of the encodings below.
	#map = affine_map<(d0, d1, d2) -> (d0, d2)>
	#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
	#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
	// Map over a 2-D iteration space that keeps only d1; it is listed after #map
	// in the broadcast encoding below.
	#bcast_map_1d = affine_map<(d0, d1) -> (d1)>
	// Encoding for operand 0 of a matmul with element types [f32, f32, f32].
	#encoding_f32f32f32_lhs = #iree_encoding.encoding<operand_index = 0, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>
	// Same as #encoding_f32f32f32_lhs except that the first user_indexing_maps
	// entry is the list [#map, #bcast_map_1d].
	#encoding_f32f32f32_lhs_bcast = #iree_encoding.encoding<operand_index = 0, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [[#map, #bcast_map_1d], #map1, #map2]>

	// Checks that an elementwise add whose inputs and output all carry
	// #encoding_f32f32f32_lhs yields, after unset_encoding, the same 129x255
	// values as the plain add of the unencoded source with itself.
	func.func @elementwise() {
	  %height = arith.constant 129 : index
	  %width = arith.constant 255 : index
	  %0 = call @generate_2D_source_f32(%height, %width) : (index, index) -> tensor<?x?xf32>
	  %source = tensor.cast %0 : tensor<?x?xf32> to tensor<129x255xf32>

	  %1 = iree_encoding.set_encoding %source : tensor<129x255xf32> -> tensor<129x255xf32, #encoding_f32f32f32_lhs>
	  %2 = tensor.empty() : tensor<129x255xf32, #encoding_f32f32f32_lhs>
	  %3 = linalg.generic {
	      indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
	                       affine_map<(d0, d1) -> (d0, d1)>,
	                       affine_map<(d0, d1) -> (d0, d1)>],
	      iterator_types = ["parallel", "parallel"]}
	      ins(%1, %1 : tensor<129x255xf32, #encoding_f32f32f32_lhs>, tensor<129x255xf32, #encoding_f32f32f32_lhs>)
	      outs(%2 : tensor<129x255xf32, #encoding_f32f32f32_lhs>) {
	  ^bb0(%in: f32, %in_0: f32, %out: f32):
	    %4 = arith.addf %in, %in_0 : f32
	    linalg.yield %4 : f32
	  } -> tensor<129x255xf32, #encoding_f32f32f32_lhs>
	  // Route the encoded result through the barrier so the unset_encoding below
	  // consumes the barrier's output rather than the generic's result directly
	  // (same pattern as %result_barrier in @unset_encoding_set_encoding_fusion).
	  %barrier = util.optimization_barrier %3 : tensor<129x255xf32, #encoding_f32f32f32_lhs>
	  %5 = iree_encoding.unset_encoding %barrier : tensor<129x255xf32, #encoding_f32f32f32_lhs> -> tensor<129x255xf32>

	  // Reference result computed without any encoding.
	  %expected = arith.addf %source, %source : tensor<129x255xf32>
	  check.expect_almost_eq(%5, %expected) : tensor<129x255xf32>
	  return
	}

	// Checks an encoded elementwise add of a 128x64 tensor with a 64-element
	// tensor read through the (d0, d1) -> (d1) map. The 2-D operand and the result
	// carry #encoding_f32f32f32_lhs, the 1-D operand carries
	// #encoding_f32f32f32_lhs_bcast. The result after unset_encoding is compared
	// against the same generic run on the unencoded tensors.
	func.func @elementwise_with_broadcast_2d() {
	  %height = arith.constant 128 : index
	  %width = arith.constant 64 : index
	  %0 = call @generate_2D_source_f32(%height, %width) : (index, index) -> tensor<?x?xf32>
	  %source = tensor.cast %0 : tensor<?x?xf32> to tensor<128x64xf32>
	  %1 = call @generate_1D_source_f32(%width) : (index) -> tensor<?xf32>
	  %bcast = tensor.cast %1 : tensor<?xf32> to tensor<64xf32>

	  %2 = iree_encoding.set_encoding %source : tensor<128x64xf32> -> tensor<128x64xf32, #encoding_f32f32f32_lhs>
	  %3 = iree_encoding.set_encoding %bcast : tensor<64xf32> -> tensor<64xf32, #encoding_f32f32f32_lhs_bcast>
	  %4 = tensor.empty() : tensor<128x64xf32, #encoding_f32f32f32_lhs>
	  %5 = linalg.generic {
	      indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
	                       affine_map<(d0, d1) -> (d1)>,
	                       affine_map<(d0, d1) -> (d0, d1)>],
	      iterator_types = ["parallel", "parallel"]}
	      ins(%2, %3 : tensor<128x64xf32, #encoding_f32f32f32_lhs>, tensor<64xf32, #encoding_f32f32f32_lhs_bcast>)
	      outs(%4 : tensor<128x64xf32, #encoding_f32f32f32_lhs>) {
	  ^bb0(%in: f32, %b: f32, %out: f32):
	    %sum = arith.addf %in, %b : f32
	    linalg.yield %sum : f32
	  } -> tensor<128x64xf32, #encoding_f32f32f32_lhs>
	  // Route the encoded result through the barrier so the unset_encoding below
	  // consumes the barrier's output rather than the generic's result directly
	  // (same pattern as %result_barrier in @unset_encoding_set_encoding_fusion).
	  %barrier = util.optimization_barrier %5 : tensor<128x64xf32, #encoding_f32f32f32_lhs>
	  %6 = iree_encoding.unset_encoding %barrier : tensor<128x64xf32, #encoding_f32f32f32_lhs> -> tensor<128x64xf32>

	  // Reference result: the same broadcast add on the unencoded tensors.
	  %init_expected = tensor.empty() : tensor<128x64xf32>
	  %expected = linalg.generic {
	      indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
	                       affine_map<(d0, d1) -> (d1)>,
	                       affine_map<(d0, d1) -> (d0, d1)>],
	      iterator_types = ["parallel", "parallel"]}
	      ins(%source, %bcast : tensor<128x64xf32>, tensor<64xf32>)
	      outs(%init_expected : tensor<128x64xf32>) {
	  ^bb0(%in: f32, %b: f32, %out: f32):
	    %expected_sum = arith.addf %in, %b : f32
	    linalg.yield %expected_sum : f32
	  } -> tensor<128x64xf32>
	  check.expect_almost_eq(%6, %expected) : tensor<128x64xf32>
	  return
	}

	// Three maps over the iteration space (d0, d1, d2, d3); every one of them
	// keeps d0. They are passed, in this order, as the user_indexing_maps of the
	// 3-D encodings below.
	#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
	#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
	#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
	// Map over a 3-D iteration space that drops d1; it is listed after #map3 in
	// the broadcast encoding below.
	#bcast_map_2d = affine_map<(d0, d1, d2) -> (d0, d2)>
	// Encoding for operand 0 of a matmul with element types [f32, f32, f32].
	#encoding_3d_f32f32f32_lhs = #iree_encoding.encoding<operand_index = 0, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [#map3, #map4, #map5]>
	// Same as #encoding_3d_f32f32f32_lhs except that the first user_indexing_maps
	// entry is the list [#map3, #bcast_map_2d].
	#encoding_3d_f32f32f32_lhs_bcast = #iree_encoding.encoding<operand_index = 0, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [[#map3, #bcast_map_2d], #map4, #map5]>
	// Checks an encoded elementwise add of a 2x128x64 tensor with a 2x64 tensor
	// read through the (d0, d1, d2) -> (d0, d2) map. The 3-D operand and the
	// result carry #encoding_3d_f32f32f32_lhs, the 2-D operand carries
	// #encoding_3d_f32f32f32_lhs_bcast. The result after unset_encoding is
	// compared against the same generic run on the unencoded tensors.
	func.func @elementwise_with_broadcast_3d() {
	  %height = arith.constant 2 : index
	  %width = arith.constant 128 : index
	  %depth = arith.constant 64 : index
	  %0 = call @generate_3D_source_f32(%height, %width, %depth) : (index, index, index) -> tensor<?x?x?xf32>
	  %source = tensor.cast %0 : tensor<?x?x?xf32> to tensor<2x128x64xf32>
	  %1 = call @generate_2D_source_f32(%height, %depth) : (index, index) -> tensor<?x?xf32>
	  %bcast = tensor.cast %1 : tensor<?x?xf32> to tensor<2x64xf32>

	  %2 = iree_encoding.set_encoding %source : tensor<2x128x64xf32> -> tensor<2x128x64xf32, #encoding_3d_f32f32f32_lhs>
	  %3 = iree_encoding.set_encoding %bcast : tensor<2x64xf32> -> tensor<2x64xf32, #encoding_3d_f32f32f32_lhs_bcast>
	  %4 = tensor.empty() : tensor<2x128x64xf32, #encoding_3d_f32f32f32_lhs>
	  %5 = linalg.generic {
	      indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
	                       affine_map<(d0, d1, d2) -> (d0, d2)>,
	                       affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
	      iterator_types = ["parallel", "parallel", "parallel"]}
	      ins(%2, %3 : tensor<2x128x64xf32, #encoding_3d_f32f32f32_lhs>, tensor<2x64xf32, #encoding_3d_f32f32f32_lhs_bcast>)
	      outs(%4 : tensor<2x128x64xf32, #encoding_3d_f32f32f32_lhs>) {
	  ^bb0(%in: f32, %b: f32, %out: f32):
	    %sum = arith.addf %in, %b : f32
	    linalg.yield %sum : f32
	  } -> tensor<2x128x64xf32, #encoding_3d_f32f32f32_lhs>
	  // Route the encoded result through the barrier so the unset_encoding below
	  // consumes the barrier's output rather than the generic's result directly
	  // (same pattern as %result_barrier in @unset_encoding_set_encoding_fusion).
	  %barrier = util.optimization_barrier %5 : tensor<2x128x64xf32, #encoding_3d_f32f32f32_lhs>
	  %6 = iree_encoding.unset_encoding %barrier : tensor<2x128x64xf32, #encoding_3d_f32f32f32_lhs> -> tensor<2x128x64xf32>

	  // Reference result: the same broadcast add on the unencoded tensors.
	  %init_expected = tensor.empty() : tensor<2x128x64xf32>
	  %expected = linalg.generic {
	      indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
	                       affine_map<(d0, d1, d2) -> (d0, d2)>,
	                       affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
	      iterator_types = ["parallel", "parallel", "parallel"]}
	      ins(%source, %bcast : tensor<2x128x64xf32>, tensor<2x64xf32>)
	      outs(%init_expected : tensor<2x128x64xf32>) {
	  ^bb0(%in: f32, %b: f32, %out: f32):
	    %expected_sum = arith.addf %in, %b : f32
	    linalg.yield %expected_sum : f32
	  } -> tensor<2x128x64xf32>
	  check.expect_almost_eq(%6, %expected) : tensor<2x128x64xf32>
	  return
	}

	// Two encodings that differ only in operand_index (2 vs. 0); both reuse
	// #map, #map1, #map2 as user_indexing_maps with element types [f32, f32, f32].
	#fusion_encoding_f32f32f32_result = #iree_encoding.encoding<operand_index = 2, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>
	#fusion_encoding_f32f32f32_lhs = #iree_encoding.encoding<operand_index = 0, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2]>
	// Places unset_encoding -> elementwise (+1.0) -> set_encoding inside a single
	// flow.dispatch.region, converting a 129x255 tensor from
	// #fusion_encoding_f32f32f32_result to #fusion_encoding_f32f32f32_lhs, and
	// checks the decoded output against source + 1.0 computed without encodings.
	func.func @unset_encoding_set_encoding_fusion() {
	  %height = arith.constant 129 : index
	  %width = arith.constant 255 : index
	  %cst = arith.constant 1.0 : f32
	  %0 = call @generate_2D_source_f32(%height, %width) : (index, index) -> tensor<?x?xf32>
	  %source = tensor.cast %0 : tensor<?x?xf32> to tensor<129x255xf32>

	  // Encode the input outside the region; the barrier result is what the
	  // region reads.
	  %1 = iree_encoding.set_encoding %source : tensor<129x255xf32> -> tensor<129x255xf32, #fusion_encoding_f32f32f32_result>
	  %src_barrier = util.optimization_barrier %1 : tensor<129x255xf32, #fusion_encoding_f32f32f32_result>

	  %dispatch = flow.dispatch.region -> (tensor<129x255xf32, #fusion_encoding_f32f32f32_lhs>) {
	    %4 = iree_encoding.unset_encoding %src_barrier : tensor<129x255xf32, #fusion_encoding_f32f32f32_result> -> tensor<129x255xf32>
	    %5 = tensor.empty() : tensor<129x255xf32>
	    %6 = linalg.generic {
	        indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
	                        affine_map<(d0, d1) -> (d0, d1)>],
	        iterator_types = ["parallel", "parallel"]}
	        ins(%4 : tensor<129x255xf32>) outs(%5 : tensor<129x255xf32>) {
	    ^bb0(%in: f32, %out: f32):
	        %8 = arith.addf %in, %cst : f32
	        linalg.yield %8 : f32
	    } -> tensor<129x255xf32>
	    %7 = iree_encoding.set_encoding %6 : tensor<129x255xf32> -> tensor<129x255xf32, #fusion_encoding_f32f32f32_lhs>
	    flow.return %7 : tensor<129x255xf32, #fusion_encoding_f32f32f32_lhs>
	  }

	  // Decode the region's result from the barrier's output.
	  %result_barrier = util.optimization_barrier %dispatch : tensor<129x255xf32, #fusion_encoding_f32f32f32_lhs>
	  %2 = iree_encoding.unset_encoding %result_barrier : tensor<129x255xf32, #fusion_encoding_f32f32f32_lhs> -> tensor<129x255xf32>

	  // Reference result: source + 1.0 on the unencoded tensor.
	  %3 = tensor.empty() : tensor<129x255xf32>
	  %expected = linalg.generic {
	      indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
	                      affine_map<(d0, d1) -> (d0, d1)>],
	      iterator_types = ["parallel", "parallel"]}
	      ins(%source : tensor<129x255xf32>) outs(%3 : tensor<129x255xf32>) {
	  ^bb0(%in: f32, %out: f32):
	      %expected_sum = arith.addf %in, %cst : f32
	      linalg.yield %expected_sum : f32
	  } -> tensor<129x255xf32>
	  check.expect_almost_eq(%2, %expected) : tensor<129x255xf32>
	  return
	}