// blob: 15f02dec1be7ec7df98a73341dacf92591e77587 [file] [log] [blame]
// y = gamma * (x - mean(x)) * rsqrt(var(x) + epsilon) + beta
// Setting gamma = 1.0 and beta = 0.0 for simplicity.
// Generated from this TOSA input:
// func.func @layernorm() {
// %x = util.unfoldable_constant dense<5.0> : tensor<128x384xf32>
// %c384 = util.unfoldable_constant dense<384.0> : tensor<128x1xf32>
// %sum = tosa.reduce_sum %x {axis = 1 : i64} : (tensor<128x384xf32>) -> tensor<128x1xf32>
// %r384 = tosa.reciprocal %c384 : (tensor<128x1xf32>) -> tensor<128x1xf32>
// %mean = tosa.mul %sum, %r384 {shift = 0 : i8} : (tensor<128x1xf32>, tensor<128x1xf32>) -> tensor<128x1xf32>
// %x_sub_mean = tosa.sub %x, %mean : (tensor<128x384xf32>, tensor<128x1xf32>) -> tensor<128x384xf32>
// %square = tosa.mul %x_sub_mean, %x_sub_mean {shift = 0 : i8} : (tensor<128x384xf32>, tensor<128x384xf32>) -> tensor<128x384xf32>
// %square_sum = tosa.reduce_sum %square {axis = 1 : i64} : (tensor<128x384xf32>) -> tensor<128x1xf32>
// %variance = tosa.mul %square_sum, %r384 {shift = 0 : i8} : (tensor<128x1xf32>, tensor<128x1xf32>) -> tensor<128x1xf32>
// %epsilon = util.unfoldable_constant dense<9.99999996E-13> : tensor<128x1xf32>
// %var_eps = tosa.add %variance, %epsilon : (tensor<128x1xf32>, tensor<128x1xf32>) -> tensor<128x1xf32>
// %rsigma = tosa.rsqrt %var_eps : (tensor<128x1xf32>) -> tensor<128x1xf32>
// %norm = tosa.mul %x_sub_mean, %rsigma {shift = 0 : i8} : (tensor<128x384xf32>, tensor<128x1xf32>) -> tensor<128x384xf32>
// check.expect_almost_eq_const(%norm, dense<0.0> : tensor<128x384xf32>) : tensor<128x384xf32>
// return
// }
// Checks layer normalization on a statically shaped 128x384 f32 input with
// gamma = 1.0 and beta = 0.0. Every input element is 5.0, so each row mean is
// ~5.0, the centered values are ~0.0, and the normalized output is compared
// against an all-zero tensor.
func.func @layernorm() {
  // 1.0 is the numerator of the reciprocal; 0.0 seeds both row reductions.
  %cst = arith.constant 1.000000e+00 : f32
  %cst_0 = arith.constant 0.000000e+00 : f32
  // Expected output: all zeros.
  %cst_1 = arith.constant dense<0.000000e+00> : tensor<128x384xf32>
  // Epsilon added to the variance before the reciprocal square root.
  %cst_2 = arith.constant dense<9.99999996E-13> : tensor<128x1xf32>
  // Row length (384) as f32, one copy per row.
  %cst_3 = arith.constant dense<3.840000e+02> : tensor<128x1xf32>
  // Input x: every element is 5.0.
  %cst_4 = arith.constant dense<5.000000e+00> : tensor<128x384xf32>

  // The input and the row length go through optimization barriers; these
  // correspond to the util.unfoldable_constant values in the TOSA listing
  // (presumably so the computation is not constant-folded away - confirm).
  %0 = util.optimization_barrier %cst_4 : tensor<128x384xf32>
  %1 = util.optimization_barrier %cst_3 : tensor<128x1xf32>

  // Row sums: reduce x along d1 into a zero-initialized 128-element tensor.
  %2 = tensor.empty() : tensor<128xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<128xf32>) -> tensor<128xf32>
  %4 = linalg.generic {
      indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                       affine_map<(d0, d1) -> (d0)>],
      iterator_types = ["parallel", "reduction"]}
      ins(%0 : tensor<128x384xf32>) outs(%3 : tensor<128xf32>) {
  ^bb0(%arg0: f32, %arg1: f32):
    %15 = arith.addf %arg0, %arg1 : f32
    linalg.yield %15 : f32
  } -> tensor<128xf32>

  // Element-wise reciprocal of the row length: 1.0 / 384.0.
  %5 = tensor.empty() : tensor<128x1xf32>
  %6 = linalg.generic {
      indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                       affine_map<(d0, d1) -> (d0, d1)>],
      iterator_types = ["parallel", "parallel"]}
      ins(%1 : tensor<128x1xf32>) outs(%5 : tensor<128x1xf32>) {
  ^bb0(%arg0: f32, %arg1: f32):
    %15 = arith.divf %cst, %arg0 : f32
    linalg.yield %15 : f32
  } -> tensor<128x1xf32>
  // Drop the unit dimension so the reciprocal can be indexed by row (d0) only.
  %7 = tensor.collapse_shape %6 [[0, 1]] : tensor<128x1xf32> into tensor<128xf32>

  // Centered values: x - (row_sum * 1/384), i.e. x - mean. The per-row
  // operands are broadcast along d1 through their (d0) indexing maps.
  %8 = tensor.empty() : tensor<128x384xf32>
  %9 = linalg.generic {
      indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                       affine_map<(d0, d1) -> (d0)>,
                       affine_map<(d0, d1) -> (d0)>,
                       affine_map<(d0, d1) -> (d0, d1)>],
      iterator_types = ["parallel", "parallel"]}
      ins(%0, %4, %7 : tensor<128x384xf32>, tensor<128xf32>, tensor<128xf32>)
      outs(%8 : tensor<128x384xf32>) {
  ^bb0(%arg0: f32, %arg1: f32, %arg2: f32, %arg3: f32):
    %15 = arith.mulf %arg1, %arg2 : f32
    %16 = arith.subf %arg0, %15 : f32
    linalg.yield %16 : f32
  } -> tensor<128x384xf32>

  // Per-row sum of squared centered values, again reduced along d1 from zero.
  %10 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<128xf32>) -> tensor<128xf32>
  %11 = linalg.generic {
      indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                       affine_map<(d0, d1) -> (d0)>],
      iterator_types = ["parallel", "reduction"]}
      ins(%9 : tensor<128x384xf32>) outs(%10 : tensor<128xf32>) {
  ^bb0(%arg0: f32, %arg1: f32):
    %15 = arith.mulf %arg0, %arg0 : f32
    %16 = arith.addf %15, %arg1 : f32
    linalg.yield %16 : f32
  } -> tensor<128xf32>

  // Epsilon also goes through a barrier, then is collapsed to one value per row.
  %12 = util.optimization_barrier %cst_2 : tensor<128x1xf32>
  %13 = tensor.collapse_shape %12 [[0, 1]] : tensor<128x1xf32> into tensor<128xf32>

  // Normalize: (x - mean) * rsqrt(square_sum * 1/384 + epsilon).
  %14 = linalg.generic {
      indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                       affine_map<(d0, d1) -> (d0)>,
                       affine_map<(d0, d1) -> (d0)>,
                       affine_map<(d0, d1) -> (d0)>,
                       affine_map<(d0, d1) -> (d0, d1)>],
      iterator_types = ["parallel", "parallel"]}
      ins(%9, %11, %7, %13 : tensor<128x384xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>)
      outs(%8 : tensor<128x384xf32>) {
  ^bb0(%arg0: f32, %arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32):
    %15 = arith.mulf %arg1, %arg2 : f32
    %16 = arith.addf %15, %arg3 : f32
    %17 = math.rsqrt %16 : f32
    %18 = arith.mulf %arg0, %17 : f32
    linalg.yield %18 : f32
  } -> tensor<128x384xf32>

  // The normalized output must be approximately zero everywhere.
  check.expect_almost_eq(%14, %cst_1) : tensor<128x384xf32>
  // Terminate the function body and close it before the next func.func.
  return
}
// Dynamic-shape variant of the layer normalization check: the same arithmetic
// as the static 128x384 case, but the input, row length and epsilon tensors
// have dynamic dimensions (`?`) in their types, and intermediate buffers are
// sized from the input's dimensions queried with tensor.dim.
func.func @layernorm_dynamic() {
// 1.0 is the numerator of the reciprocal; 0.0 seeds both row reductions.
%cst = arith.constant 1.000000e+00 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
// Expected output: all zeros, statically shaped.
%cst_1 = arith.constant dense<0.000000e+00> : tensor<128x384xf32>
// Epsilon, row length (384.0) and input x (all 5.0), each produced with a
// dynamically shaped result type.
%cst_2 = flow.tensor.dynamic_constant dense<9.99999996E-13> : tensor<128x1xf32> -> tensor<?x1xf32>
%cst_3 = flow.tensor.dynamic_constant dense<3.840000e+02> : tensor<128x1xf32> -> tensor<?x1xf32>
%cst_4 = flow.tensor.dynamic_constant dense<5.000000e+00> : tensor<128x384xf32> -> tensor<?x?xf32>
// Query both dimensions of the input to size the intermediate tensors.
%c_0_index = arith.constant 0 : index
%c_1_index = arith.constant 1 : index
%dim_0 = tensor.dim %cst_4, %c_0_index : tensor<?x?xf32>
%dim_1 = tensor.dim %cst_4, %c_1_index : tensor<?x?xf32>
// Row sums: reduce x along d1 into a zero-initialized tensor of dim_0 elements.
%2 = tensor.empty(%dim_0) : tensor<?xf32>
%3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<?xf32>) -> tensor<?xf32>
%4 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%cst_4 : tensor<?x?xf32>) outs(%3 : tensor<?xf32>) {
^bb0(%arg0: f32, %arg1: f32):
%15 = arith.addf %arg0, %arg1 : f32
linalg.yield %15 : f32
} -> tensor<?xf32>
// Element-wise reciprocal of the row length: 1.0 / 384.0.
%5 = tensor.empty(%dim_0) : tensor<?x1xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst_3 : tensor<?x1xf32>) outs(%5 : tensor<?x1xf32>) {
^bb0(%arg0: f32, %arg1: f32):
%15 = arith.divf %cst, %arg0 : f32
linalg.yield %15 : f32
} -> tensor<?x1xf32>
// Drop the unit dimension so the reciprocal can be indexed by row (d0) only.
%7 = tensor.collapse_shape %6 [[0, 1]] : tensor<?x1xf32> into tensor<?xf32>
// Centered values: x - (row_sum * 1/384), i.e. x - mean. The per-row operands
// are broadcast along d1 through their (d0) indexing maps.
%8 = tensor.empty(%dim_0, %dim_1) : tensor<?x?xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst_4, %4, %7 : tensor<?x?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%8 : tensor<?x?xf32>) {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32, %arg3: f32):
%15 = arith.mulf %arg1, %arg2 : f32
%16 = arith.subf %arg0, %15 : f32
linalg.yield %16 : f32
} -> tensor<?x?xf32>
// Per-row sum of squared centered values, again reduced along d1 from zero.
%10 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<?xf32>) -> tensor<?xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%9 : tensor<?x?xf32>) outs(%10 : tensor<?xf32>) {
^bb0(%arg0: f32, %arg1: f32):
%15 = arith.mulf %arg0, %arg0 : f32
%16 = arith.addf %15, %arg1 : f32
linalg.yield %16 : f32
} -> tensor<?xf32>
// Collapse epsilon to one value per row.
%13 = tensor.collapse_shape %cst_2 [[0, 1]] : tensor<?x1xf32> into tensor<?xf32>
// Normalize: (x - mean) * rsqrt(square_sum * 1/384 + epsilon).
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9, %11, %7, %13 : tensor<?x?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>) outs(%8 : tensor<?x?xf32>) {
^bb0(%arg0: f32, %arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32):
%15 = arith.mulf %arg1, %arg2 : f32
%16 = arith.addf %15, %arg3 : f32
%17 = math.rsqrt %16 : f32
%18 = arith.mulf %arg0, %17 : f32
linalg.yield %18 : f32
} -> tensor<?x?xf32>
// Cast back to the static shape so the result can be compared with the
// statically shaped all-zero expectation.
%result = tensor.cast %14 : tensor<?x?xf32> to tensor<128x384xf32>
check.expect_almost_eq(%result, %cst_1) : tensor<128x384xf32>
// NOTE(review): no `return` terminator or closing `}` is visible after this
// line in the reviewed excerpt - confirm the function is properly terminated.