// Affine map aliases referenced by the linalg.generic op in @softmax.
// #map  : identity map over the three loop dimensions (d0, d1, d2).
// #map1 : keeps (d0, d1) and drops d2, so an operand indexed through it is
//         read with the same element for every value of d2.
| #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> | 
 | #map1 = affine_map<(d0, d1, d2) -> (d0, d1)> | 
 // Module holding a single function, @softmax, which maps a
 // tensor<16x128x128xf32> to a tensor of the same type.
 //
 // NOTE(review): despite the name, the ops visible here only add a zero-filled
 // 16x128 tensor (broadcast along the last dimension) to the input; no
 // max/exp/sum/div ops appear. This looks like an intermediate or reduced IR
 // dump -- confirm against the pipeline stage that produced it.
 | module { | 
 |   func.func @softmax(%arg0: tensor<16x128x128xf32>) -> tensor<16x128x128xf32> { | 
       // Scalar 0.0 used as the fill value below.
 |     %cst = arith.constant 0.000000e+00 : f32 | 
       // Destination for the 16x128 fill.
 |     %0 = tensor.empty() : tensor<16x128xf32> | 
       // Fill the 16x128 tensor with 0.0.
 |     %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<16x128xf32>) -> tensor<16x128xf32> | 
       // Second fill with the same scalar, using the first fill's result as outs.
       // NOTE(review): looks redundant with the fill above -- confirm whether it
       // is intentional before removing it.
 |     %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<16x128xf32>) -> tensor<16x128xf32> | 
       // Destination for the elementwise result.
 |     %3 = tensor.empty() : tensor<16x128x128xf32> | 
       // All-parallel generic: %arg0 and the output are indexed with #map,
       // %2 is indexed with #map1, i.e. by (d0, d1) only.
 |     %4 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg0, %2 : tensor<16x128x128xf32>, tensor<16x128xf32>) outs(%3 : tensor<16x128x128xf32>) { | 
       // %in is the %arg0 element, %in_0 the %2 element; %out is not read.
 |     ^bb0(%in: f32, %in_0: f32, %out: f32): | 
 |       %6 = arith.addf %in, %in_0 : f32 | 
 |       linalg.yield %6 : f32 | 
 |     } -> tensor<16x128x128xf32> | 
       // NOTE(review): %5 is never used; the return below uses %4 directly.
       // Presumably the barrier is meant to keep %4 from being optimized away --
       // confirm whether the return was intended to be %5.
 |     %5 = util.optimization_barrier %4 : tensor<16x128x128xf32> | 
 |     return %4 : tensor<16x128x128xf32> | 
 |   } | 
 | } |