// y = gamma * (x - mean(x)) * rsqrt(var(x) + epsilon) + beta
| // Setting gamma = 1.0 and beta = 0.0 for simplicity. |
| // |
| // Generated from this TOSA input: |
| // |
| // func.func @layernorm() { |
| // %x = util.unfoldable_constant dense<5.0> : tensor<128x384xf32> |
| // %c384 = util.unfoldable_constant dense<384.0> : tensor<128x1xf32> |
| // %sum = tosa.reduce_sum %x {axis = 1 : i64} : (tensor<128x384xf32>) -> tensor<128x1xf32> |
| // %r384 = tosa.reciprocal %c384 : (tensor<128x1xf32>) -> tensor<128x1xf32> |
| // %mean = tosa.mul %sum, %r384 {shift = 0 : i8} : (tensor<128x1xf32>, tensor<128x1xf32>) -> tensor<128x1xf32> |
| // %x_sub_mean = tosa.sub %x, %mean : (tensor<128x384xf32>, tensor<128x1xf32>) -> tensor<128x384xf32> |
| // %square = tosa.mul %x_sub_mean, %x_sub_mean {shift = 0 : i8} : (tensor<128x384xf32>, tensor<128x384xf32>) -> tensor<128x384xf32> |
| // %square_sum = tosa.reduce_sum %square {axis = 1 : i64} : (tensor<128x384xf32>) -> tensor<128x1xf32> |
| // %variance = tosa.mul %square_sum, %r384 {shift = 0 : i8} : (tensor<128x1xf32>, tensor<128x1xf32>) -> tensor<128x1xf32> |
| // %epsilon = util.unfoldable_constant dense<9.99999996E-13> : tensor<128x1xf32> |
| // %var_eps = tosa.add %variance, %epsilon : (tensor<128x1xf32>, tensor<128x1xf32>) -> tensor<128x1xf32> |
| // %rsigma = tosa.rsqrt %var_eps : (tensor<128x1xf32>) -> tensor<128x1xf32> |
| // %norm = tosa.mul %x_sub_mean, %rsigma {shift = 0 : i8} : (tensor<128x384xf32>, tensor<128x1xf32>) -> tensor<128x384xf32> |
| // check.expect_almost_eq_const(%norm, dense<0.0> : tensor<128x384xf32>) : tensor<128x384xf32> |
| // return |
| // } |
| |
| func.func @layernorm() { |
| %cst = arith.constant 1.000000e+00 : f32 |
| %cst_0 = arith.constant 0.000000e+00 : f32 |
| %cst_1 = arith.constant dense<0.000000e+00> : tensor<128x384xf32> |
| %cst_2 = arith.constant dense<9.99999996E-13> : tensor<128x1xf32> |
| %cst_3 = arith.constant dense<3.840000e+02> : tensor<128x1xf32> |
| %cst_4 = arith.constant dense<5.000000e+00> : tensor<128x384xf32> |
| %0 = util.optimization_barrier %cst_4 : tensor<128x384xf32> |
| %1 = util.optimization_barrier %cst_3 : tensor<128x1xf32> |
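  // sum(x) along axis 1 (lowered tosa.reduce_sum), accumulated into a
  // zero-filled tensor<128xf32>.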
| %2 = tensor.empty() : tensor<128xf32> |
| %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<128xf32>) -> tensor<128xf32> |
| %4 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%0 : tensor<128x384xf32>) outs(%3 : tensor<128xf32>) { |
| ^bb0(%arg0: f32, %arg1: f32): |
| %15 = arith.addf %arg0, %arg1 : f32 |
| linalg.yield %15 : f32 |
| } -> tensor<128xf32> |
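  // 1.0 / 384.0 (lowered tosa.reciprocal of the row length), then collapsed
  // from tensor<128x1xf32> to tensor<128xf32>.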
| %5 = tensor.empty() : tensor<128x1xf32> |
| %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%1 : tensor<128x1xf32>) outs(%5 : tensor<128x1xf32>) { |
| ^bb0(%arg0: f32, %arg1: f32): |
| %15 = arith.divf %cst, %arg0 : f32 |
| linalg.yield %15 : f32 |
| } -> tensor<128x1xf32> |
| %7 = tensor.collapse_shape %6 [[0, 1]] : tensor<128x1xf32> into tensor<128xf32> |
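  // x - mean(x), where mean = sum(x) * (1.0 / 384.0).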
| %8 = tensor.empty() : tensor<128x384xf32> |
| %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0, %4, %7 : tensor<128x384xf32>, tensor<128xf32>, tensor<128xf32>) outs(%8 : tensor<128x384xf32>) { |
| ^bb0(%arg0: f32, %arg1: f32, %arg2: f32, %arg3: f32): |
| %15 = arith.mulf %arg1, %arg2 : f32 |
| %16 = arith.subf %arg0, %15 : f32 |
| linalg.yield %16 : f32 |
| } -> tensor<128x384xf32> |
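  // sum((x - mean(x))^2) along axis 1.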
| %10 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<128xf32>) -> tensor<128xf32> |
| %11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%9 : tensor<128x384xf32>) outs(%10 : tensor<128xf32>) { |
| ^bb0(%arg0: f32, %arg1: f32): |
| %15 = arith.mulf %arg0, %arg0 : f32 |
| %16 = arith.addf %15, %arg1 : f32 |
| linalg.yield %16 : f32 |
| } -> tensor<128xf32> |
| %12 = util.optimization_barrier %cst_2 : tensor<128x1xf32> |
| %13 = tensor.collapse_shape %12 [[0, 1]] : tensor<128x1xf32> into tensor<128xf32> |
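  // variance = square_sum * (1.0 / 384.0);
  // result = (x - mean(x)) * rsqrt(variance + epsilon).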
| %14 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9, %11, %7, %13 : tensor<128x384xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>) outs(%8 : tensor<128x384xf32>) { |
| ^bb0(%arg0: f32, %arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32): |
| %15 = arith.mulf %arg1, %arg2 : f32 |
| %16 = arith.addf %15, %arg3 : f32 |
| %17 = math.rsqrt %16 : f32 |
| %18 = arith.mulf %arg0, %17 : f32 |
| linalg.yield %18 : f32 |
| } -> tensor<128x384xf32> |
| check.expect_almost_eq(%14, %cst_1) : tensor<128x384xf32> |
| return |
| } |
| |
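// Same computation as @layernorm above, but with the shapes made dynamic at
// runtime via flow.tensor.dynamic_constant; the dynamic result is cast back
// to the static shape before the check.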
| func.func @layernorm_dynamic() { |
| %cst = arith.constant 1.000000e+00 : f32 |
| %cst_0 = arith.constant 0.000000e+00 : f32 |
| %cst_1 = arith.constant dense<0.000000e+00> : tensor<128x384xf32> |
| %cst_2 = flow.tensor.dynamic_constant dense<9.99999996E-13> : tensor<128x1xf32> -> tensor<?x1xf32> |
| %cst_3 = flow.tensor.dynamic_constant dense<3.840000e+02> : tensor<128x1xf32> -> tensor<?x1xf32> |
| %cst_4 = flow.tensor.dynamic_constant dense<5.000000e+00> : tensor<128x384xf32> -> tensor<?x?xf32> |
| %c_0_index = arith.constant 0 : index |
| %c_1_index = arith.constant 1 : index |
| %dim_0 = tensor.dim %cst_4, %c_0_index : tensor<?x?xf32> |
| %dim_1 = tensor.dim %cst_4, %c_1_index : tensor<?x?xf32> |
| %2 = tensor.empty(%dim_0) : tensor<?xf32> |
| %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<?xf32>) -> tensor<?xf32> |
| %4 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%cst_4 : tensor<?x?xf32>) outs(%3 : tensor<?xf32>) { |
| ^bb0(%arg0: f32, %arg1: f32): |
| %15 = arith.addf %arg0, %arg1 : f32 |
| linalg.yield %15 : f32 |
| } -> tensor<?xf32> |
| %5 = tensor.empty(%dim_0) : tensor<?x1xf32> |
| %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst_3 : tensor<?x1xf32>) outs(%5 : tensor<?x1xf32>) { |
| ^bb0(%arg0: f32, %arg1: f32): |
| %15 = arith.divf %cst, %arg0 : f32 |
| linalg.yield %15 : f32 |
| } -> tensor<?x1xf32> |
| %7 = tensor.collapse_shape %6 [[0, 1]] : tensor<?x1xf32> into tensor<?xf32> |
| %8 = tensor.empty(%dim_0, %dim_1) : tensor<?x?xf32> |
| %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst_4, %4, %7 : tensor<?x?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%8 : tensor<?x?xf32>) { |
| ^bb0(%arg0: f32, %arg1: f32, %arg2: f32, %arg3: f32): |
| %15 = arith.mulf %arg1, %arg2 : f32 |
| %16 = arith.subf %arg0, %15 : f32 |
| linalg.yield %16 : f32 |
| } -> tensor<?x?xf32> |
| %10 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<?xf32>) -> tensor<?xf32> |
| %11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%9 : tensor<?x?xf32>) outs(%10 : tensor<?xf32>) { |
| ^bb0(%arg0: f32, %arg1: f32): |
| %15 = arith.mulf %arg0, %arg0 : f32 |
| %16 = arith.addf %15, %arg1 : f32 |
| linalg.yield %16 : f32 |
| } -> tensor<?xf32> |
| %13 = tensor.collapse_shape %cst_2 [[0, 1]] : tensor<?x1xf32> into tensor<?xf32> |
| %14 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9, %11, %7, %13 : tensor<?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%8 : tensor<?x?xf32>) { |
| ^bb0(%arg0: f32, %arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32): |
| %15 = arith.mulf %arg1, %arg2 : f32 |
| %16 = arith.addf %15, %arg3 : f32 |
| %17 = math.rsqrt %16 : f32 |
| %18 = arith.mulf %arg0, %17 : f32 |
| linalg.yield %18 : f32 |
| } -> tensor<?x?xf32> |
| %result = tensor.cast %14 : tensor<?x?xf32> to tensor<128x384xf32> |
| check.expect_almost_eq(%result, %cst_1) : tensor<128x384xf32> |
| return |
| } |