| // Tests large, aligned linalg matmuls to make sure they go through the | 
 | // optimized path for GPUs. | 
 |  | 
 | // Problem size      : 2048x512x1024 | 
 | // Input type        : F32 | 
 | // Accumulation type : F32 | 
 | func.func @matmul_2048x512x1024_f32_f32() { | 
 |   // Constant-filled operands: a 2048x1024 LHS of 1.0 and a 1024x512 RHS of 0.4. | 
 |   %a = util.unfoldable_constant dense<1.0> : tensor<2048x1024xf32> | 
 |   %b = util.unfoldable_constant dense<0.4> : tensor<1024x512xf32> | 
 |   // Zero-fill the 2048x512 accumulator that the matmul writes into. | 
 |   %zero = arith.constant 0.0 : f32 | 
 |   %empty = tensor.empty() : tensor<2048x512xf32> | 
 |   %acc = linalg.fill ins(%zero : f32) outs(%empty : tensor<2048x512xf32>) -> tensor<2048x512xf32> | 
 |   %result = linalg.matmul ins(%a, %b : tensor<2048x1024xf32>, tensor<1024x512xf32>) | 
 |                           outs(%acc : tensor<2048x512xf32>) -> tensor<2048x512xf32> | 
 |   // Each output element sums 1024 products of 1.0 * 0.4 (nominally 409.6). | 
 |   // NOTE(review): the expected 409.596 presumably reflects f32 accumulation | 
 |   // rounding - confirm before changing the constant. | 
 |   check.expect_almost_eq_const(%result, dense<409.596> : tensor<2048x512xf32>) : tensor<2048x512xf32> | 
 |   return | 
 | } | 
 |  | 
 | // Problem size      : 3456x1024x2048 | 
 | // Input type        : F16 | 
 | // Accumulation type : F16 | 
 | func.func @matmul_3456x1024x2048_f16_f16() { | 
 |   // Constant-filled operands: a 3456x2048 LHS of 1.00 and a 2048x1024 RHS of 0.01. | 
 |   %a = util.unfoldable_constant dense<1.00> : tensor<3456x2048xf16> | 
 |   %b = util.unfoldable_constant dense<0.01> : tensor<2048x1024xf16> | 
 |   // Zero-fill the 3456x1024 accumulator that the matmul writes into. | 
 |   %zero = arith.constant 0.0 : f16 | 
 |   %empty = tensor.empty() : tensor<3456x1024xf16> | 
 |   %acc = linalg.fill ins(%zero : f16) outs(%empty : tensor<3456x1024xf16>) -> tensor<3456x1024xf16> | 
 |   %result = linalg.matmul ins(%a, %b : tensor<3456x2048xf16>, tensor<2048x1024xf16>) | 
 |                           outs(%acc : tensor<3456x1024xf16>) -> tensor<3456x1024xf16> | 
 |   // Each output element sums 2048 products of 1.00 * 0.01 (nominally 20.48). | 
 |   // NOTE(review): the expected 20.2812 presumably reflects f16 rounding of the | 
 |   // inputs and accumulator - confirm before changing the constant. | 
 |   check.expect_almost_eq_const(%result, dense<20.2812> : tensor<3456x1024xf16>) : tensor<3456x1024xf16> | 
 |   return | 
 | } | 