// Identity indexing: each operand dimension maps to the same loop dimension.
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
// Drops d2, so a rank-2 operand is read at (d0, d1) for every d2, i.e. it is
// broadcast along the innermost loop dimension.
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
module {
  // Adds a zero-filled 16x128 tensor, broadcast along the last dimension, to
  // the 16x128x128 input and returns the result behind an optimization barrier.
  //
  // NOTE(review): despite the name, nothing in this body computes exp, a
  // reduction, or a division; it is only a broadcast add of 0.0. Presumably a
  // reduced/placeholder test case -- confirm before relying on the name.
  func.func @softmax(%arg0: tensor<16x128x128xf32>) -> tensor<16x128x128xf32> {
    %zero = arith.constant 0.000000e+00 : f32

    // Rank-2 addend, filled once with 0.0. (A second fill of the same value
    // over an already zero-filled tensor would be redundant.)
    %addend_init = tensor.empty() : tensor<16x128xf32>
    %addend = linalg.fill ins(%zero : f32) outs(%addend_init : tensor<16x128xf32>) -> tensor<16x128xf32>

    // Destination for the elementwise result; its initial contents are never
    // read because the payload below ignores %out.
    %result_init = tensor.empty() : tensor<16x128x128xf32>

    // Fully parallel elementwise add: result[d0, d1, d2] = arg0[d0, d1, d2] + addend[d0, d1].
    %sum = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg0, %addend : tensor<16x128x128xf32>, tensor<16x128xf32>) outs(%result_init : tensor<16x128x128xf32>) {
    ^bb0(%in: f32, %in_0: f32, %out: f32):
      %add = arith.addf %in, %in_0 : f32
      linalg.yield %add : f32
    } -> tensor<16x128x128xf32>

    // Return the barrier's result rather than %sum directly, so the barrier
    // sits on the def-use chain of the returned value instead of being an
    // unused op. NOTE(review): presumably this is IREE's
    // util.optimization_barrier, which forwards its operand unchanged -- confirm.
    %guarded = util.optimization_barrier %sum : tensor<16x128x128xf32>
    return %guarded : tensor<16x128x128xf32>
  }
}