Merge pull request #7265 from MaheshRavishankar:main-to-google

PiperOrigin-RevId: 401101859
diff --git a/SUBMODULE_VERSIONS.txt b/SUBMODULE_VERSIONS.txt
index 89a3db9..2d928d9 100644
--- a/SUBMODULE_VERSIONS.txt
+++ b/SUBMODULE_VERSIONS.txt
@@ -4,7 +4,7 @@
 aa533abfd4232b01f9e57041d70114d5a77e6de0 third_party/googletest
 88b845dee001723c4a0db1fe5477de735b6d3bb0 third_party/liburing
 acd6f6f014c25e46363e718381e0b35205df2d83 third_party/libyaml
-5f7a5353301b776ffb0e5fb048992898507bf7ee third_party/llvm-project
+471b25e217e635e058bbdbca8c693e2998380a60 third_party/llvm-project
 777b5c11c09fbc2e19974054351b94b3aa7ae6d0 third_party/mlir-hlo
 3f701faace7addc75d16dea8a6cd769fa5b3f260 third_party/musl
 4c7697dbe973ed01ae6fbec37d186ebd05982e1f third_party/pybind11
diff --git a/build_tools/benchmarks/run_benchmarks_on_android.py b/build_tools/benchmarks/run_benchmarks_on_android.py
index 0271a72..1c13a9c 100755
--- a/build_tools/benchmarks/run_benchmarks_on_android.py
+++ b/build_tools/benchmarks/run_benchmarks_on_android.py
@@ -249,10 +249,10 @@
 
     # We can choose this benchmark if it matches the driver and CPU/GPU
     # architecture.
-    matched_driver = (driver_filter is None or
-                      iree_driver == driver_filter.lower())
-    matched_arch = (target_arch == cpu_target_arch or
-                    target_arch == gpu_target_arch)
+    matched_driver = (
+        driver_filter is None or iree_driver == driver_filter.lower())
+    matched_arch = (
+        target_arch == cpu_target_arch or target_arch == gpu_target_arch)
     should_choose = matched_driver and matched_arch
     if should_choose:
       matched_benchmarks.append(root)
@@ -477,10 +477,11 @@
   parser.add_argument("--capture_tarball",
                       default=None,
                       help="Path to the tarball for captures")
-  parser.add_argument("--no-clean",
-                      action="store_true",
-                      help="Do not clean up the temporary directory used for "
-                      "benchmarking on the Android device")
+  parser.add_argument(
+      "--no-clean",
+      action="store_true",
+      help="Do not clean up the temporary directory used for "
+      "benchmarking on the Android device")
   parser.add_argument("--verbose",
                       action="store_true",
                       help="Print internal information during execution")
diff --git a/iree/compiler/Dialect/Flow/Transforms/PadLinalgOps.cpp b/iree/compiler/Dialect/Flow/Transforms/PadLinalgOps.cpp
index 19786fd..48cba52 100644
--- a/iree/compiler/Dialect/Flow/Transforms/PadLinalgOps.cpp
+++ b/iree/compiler/Dialect/Flow/Transforms/PadLinalgOps.cpp
@@ -81,7 +81,7 @@
         (paddingForM > 0 || paddingForK > 0)
             ? linalg::PadTensorOp::createPadScalarOp(
                   lhsPaddedType, lhs, lhsPaddingValue, createPadding({0, 0}),
-                  createPadding({paddingForM, paddingForK}), /*packing=*/false,
+                  createPadding({paddingForM, paddingForK}), /*nofold=*/false,
                   loc, rewriter)
             : lhs;
 
@@ -89,7 +89,7 @@
         (paddingForK > 0 || paddingForN > 0)
             ? linalg::PadTensorOp::createPadScalarOp(
                   rhsPaddedType, rhs, rhsPaddingValue, createPadding({0, 0}),
-                  createPadding({paddingForK, paddingForN}), /*packing=*/false,
+                  createPadding({paddingForK, paddingForN}), /*nofold=*/false,
                   loc, rewriter)
             : rhs;
 
@@ -107,7 +107,7 @@
           loc, rewriter.getZeroAttr(resultType.getElementType()));
       Value paddedResult = linalg::PadTensorOp::createPadScalarOp(
           newResultType, result, resultPaddingValue, createPadding({0, 0}),
-          createPadding({paddingForM, paddingForN}), /*packing=*/false, loc,
+          createPadding({paddingForM, paddingForN}), /*nofold=*/false, loc,
           rewriter);
       auto paddedMatmulOp =
           cast<linalg::LinalgOp>(matmulOp.getOperation())
diff --git a/iree/compiler/InputConversion/MHLO/MHLOToMHLOPreprocessing.cpp b/iree/compiler/InputConversion/MHLO/MHLOToMHLOPreprocessing.cpp
index 380e4c9..1f5084e 100644
--- a/iree/compiler/InputConversion/MHLO/MHLOToMHLOPreprocessing.cpp
+++ b/iree/compiler/InputConversion/MHLO/MHLOToMHLOPreprocessing.cpp
@@ -141,9 +141,9 @@
     paddingHigh.append(rank, 0);
     interiorPadding.append(rank, 0);
     for (auto iter :
-         llvm::enumerate(op.dimension_numbers().input_spatial_dimensions())) {
+         llvm::enumerate(op.dimension_numbers().getInputSpatialDimensions())) {
       unsigned idx = iter.index();
-      unsigned dim = iter.value().getZExtValue();
+      unsigned dim = iter.value();
       paddingLow[dim] = op.paddingAttr().getValue<int64_t>({idx, 0});
       paddingHigh[dim] = op.paddingAttr().getValue<int64_t>({idx, 1});
     }
@@ -195,19 +195,13 @@
     }
 
     auto dimensionNumbers = op.dimension_numbers();
-    auto inputSpatialDimensions = dimensionNumbers.input_spatial_dimensions();
-    llvm::SmallVector<int64_t, 4> spatialDims;
-    for (auto dim : inputSpatialDimensions) {
-      spatialDims.push_back(dim.getSExtValue());
-    }
+    auto spatialDims = dimensionNumbers.getInputSpatialDimensions();
 
     // Compute the permutation required to create a standard order.
     llvm::SmallVector<int64_t, 4> permutations;
-    permutations.push_back(
-        dimensionNumbers.input_batch_dimension().getValue().getSExtValue());
+    permutations.push_back(dimensionNumbers.getInputBatchDimension());
     permutations.append(spatialDims.begin(), spatialDims.end());
-    permutations.push_back(
-        dimensionNumbers.input_feature_dimension().getValue().getSExtValue());
+    permutations.push_back(dimensionNumbers.getInputFeatureDimension());
 
     // If the permutation is iota then no reordering is required.
     if (isIota(permutations)) {
@@ -227,18 +221,17 @@
     llvm::SmallVector<int64_t, 4> newSpatialDimensions(spatialDims.size());
     std::iota(newSpatialDimensions.begin(), newSpatialDimensions.end(), 1);
 
-    auto newDimensionNumbers = mhlo::ConvDimensionNumbers::get(
-        /*input_batch_dimension=*/rewriter.getI64IntegerAttr(0),
-        /*input_feature_dimension=*/
-        rewriter.getI64IntegerAttr(newSpatialDimensions.size() + 1),
-        /*input_spatial_dimensions=*/
-        rewriter.getI64TensorAttr(newSpatialDimensions),
-        dimensionNumbers.kernel_input_feature_dimension(),
-        dimensionNumbers.kernel_output_feature_dimension(),
-        dimensionNumbers.kernel_spatial_dimensions(),
-        dimensionNumbers.output_batch_dimension(),
-        dimensionNumbers.output_feature_dimension(),
-        dimensionNumbers.output_spatial_dimensions(), op.getContext());
+    auto newDimensionNumbers = mhlo::ConvDimensionNumbersAttr::get(
+        op.getContext(),
+        /*input_batch_dimension=*/0,
+        /*input_feature_dimension=*/newSpatialDimensions.size() + 1,
+        /*input_spatial_dimensions=*/newSpatialDimensions,
+        dimensionNumbers.getKernelInputFeatureDimension(),
+        dimensionNumbers.getKernelOutputFeatureDimension(),
+        dimensionNumbers.getKernelSpatialDimensions(),
+        dimensionNumbers.getOutputBatchDimension(),
+        dimensionNumbers.getOutputFeatureDimension(),
+        dimensionNumbers.getOutputSpatialDimensions());
 
     SmallVector<Value, 2> operands = {transposed, op.rhs()};
     auto newConv = rewriter.create<mhlo::ConvOp>(op.getLoc(), op.getType(),
@@ -261,19 +254,16 @@
 
     auto dimensionNumbers = op.dimension_numbers();
 
-    auto inputSpatialDimensions = dimensionNumbers.kernel_spatial_dimensions();
-    llvm::SmallVector<int64_t, 4> spatialDims;
-    for (auto dim : inputSpatialDimensions) {
-      spatialDims.push_back(dim.getSExtValue());
-    }
+    auto spatialDims = dimensionNumbers.getKernelSpatialDimensions();
 
     auto inputFeatureDimension =
-        dimensionNumbers.kernel_input_feature_dimension().getInt();
+        dimensionNumbers.getKernelInputFeatureDimension();
     auto outputFeatureDimension =
-        dimensionNumbers.kernel_output_feature_dimension().getInt();
+        dimensionNumbers.getKernelOutputFeatureDimension();
 
     // Compute the permutation for the transpose.
-    llvm::SmallVector<int64_t, 4> permutation(spatialDims);
+    llvm::SmallVector<int64_t, 4> permutation(spatialDims.begin(),
+                                              spatialDims.end());
     permutation.push_back(inputFeatureDimension);
     permutation.push_back(outputFeatureDimension);
 
@@ -293,18 +283,17 @@
         RankedTensorType::get(transposeShape, kernelType.getElementType()),
         kernel, rewriter.getI64TensorAttr(permutation));
 
-    auto newDimensionNumbers = mhlo::ConvDimensionNumbers::get(
-        dimensionNumbers.input_batch_dimension(),
-        dimensionNumbers.input_feature_dimension(),
-        dimensionNumbers.input_spatial_dimensions(),
+    auto newDimensionNumbers = mhlo::ConvDimensionNumbersAttr::get(
+        op.getContext(), dimensionNumbers.getInputBatchDimension(),
+        dimensionNumbers.getInputFeatureDimension(),
+        dimensionNumbers.getInputSpatialDimensions(),
         /*kernel_input_feature_dimension=*/
-        rewriter.getI64IntegerAttr(newSpatialDimensions.size()),
+        newSpatialDimensions.size(),
         /*kernel_output_feature_dimension=*/
-        rewriter.getI64IntegerAttr(newSpatialDimensions.size() + 1),
-        rewriter.getI64TensorAttr(newSpatialDimensions),
-        dimensionNumbers.output_batch_dimension(),
-        dimensionNumbers.output_feature_dimension(),
-        dimensionNumbers.output_spatial_dimensions(), op.getContext());
+        newSpatialDimensions.size() + 1, newSpatialDimensions,
+        dimensionNumbers.getOutputBatchDimension(),
+        dimensionNumbers.getOutputFeatureDimension(),
+        dimensionNumbers.getOutputSpatialDimensions());
 
     SmallVector<Value, 2> operands = {op.lhs(), transposeKernel};
     mhlo::ConvOp newConv = rewriter.create<mhlo::ConvOp>(
@@ -330,19 +319,13 @@
     }
 
     auto dimensionNumbers = op.dimension_numbers();
-    auto outputSpatialDimensions = dimensionNumbers.output_spatial_dimensions();
-    llvm::SmallVector<int64_t, 4> spatialDims;
-    for (auto dim : outputSpatialDimensions) {
-      spatialDims.push_back(dim.getSExtValue());
-    }
+    auto spatialDims = dimensionNumbers.getOutputSpatialDimensions();
 
     // Compute the permutation to transpose to an ordered output.
     llvm::SmallVector<int64_t, 4> permutation;
-    permutation.push_back(
-        dimensionNumbers.output_batch_dimension().getValue().getSExtValue());
+    permutation.push_back(dimensionNumbers.getOutputBatchDimension());
     permutation.append(spatialDims.begin(), spatialDims.end());
-    permutation.push_back(
-        dimensionNumbers.output_feature_dimension().getValue().getSExtValue());
+    permutation.push_back(dimensionNumbers.getOutputFeatureDimension());
 
     // If the permutation is iota then no reordering is required.
     if (isIota(permutation)) {
@@ -364,18 +347,16 @@
     llvm::SmallVector<int64_t, 4> newSpatialDimensions(spatialDims.size());
     std::iota(newSpatialDimensions.begin(), newSpatialDimensions.end(), 1);
 
-    auto newDimensionNumbers = mhlo::ConvDimensionNumbers::get(
-        dimensionNumbers.input_batch_dimension(),
-        dimensionNumbers.input_feature_dimension(),
-        dimensionNumbers.input_spatial_dimensions(),
-        dimensionNumbers.kernel_input_feature_dimension(),
-        dimensionNumbers.kernel_output_feature_dimension(),
-        dimensionNumbers.kernel_spatial_dimensions(),
-        /*output_batch_dimension=*/rewriter.getI64IntegerAttr(0),
-        /*output_feature_dimension=*/
-        rewriter.getI64IntegerAttr(newSpatialDimensions.size() + 1),
-        /*output_spatial_dimensions=*/
-        rewriter.getI64TensorAttr(newSpatialDimensions), op.getContext());
+    auto newDimensionNumbers = mhlo::ConvDimensionNumbersAttr::get(
+        op.getContext(), dimensionNumbers.getInputBatchDimension(),
+        dimensionNumbers.getInputFeatureDimension(),
+        dimensionNumbers.getInputSpatialDimensions(),
+        dimensionNumbers.getKernelInputFeatureDimension(),
+        dimensionNumbers.getKernelOutputFeatureDimension(),
+        dimensionNumbers.getKernelSpatialDimensions(),
+        /*output_batch_dimension=*/0,
+        /*output_feature_dimension=*/newSpatialDimensions.size() + 1,
+        /*output_spatial_dimensions=*/newSpatialDimensions);
 
     SmallVector<Value, 2> operands = {op.lhs(), op.rhs()};
     auto newConv = rewriter.create<mhlo::ConvOp>(
@@ -464,10 +445,10 @@
 
   LogicalResult matchAndRewrite(mhlo::ConvOp op,
                                 PatternRewriter &rewriter) const override {
-    const auto featureInDim =
-        op.dimension_numbers().kernel_input_feature_dimension().getInt();
-    const auto featureOutDim =
-        op.dimension_numbers().kernel_output_feature_dimension().getInt();
+    int64_t featureInDim =
+        op.dimension_numbers().getKernelInputFeatureDimension();
+    int64_t featureOutDim =
+        op.dimension_numbers().getKernelOutputFeatureDimension();
     const auto &kernelShape = op.rhs().getType().cast<ShapedType>().getShape();
     if (kernelShape[featureInDim] != 1) return failure();
 
diff --git a/iree/compiler/InputConversion/MHLO/test/mhlo_to_mhlo_preprocessing.mlir b/iree/compiler/InputConversion/MHLO/test/mhlo_to_mhlo_preprocessing.mlir
index fbe2a7b..0ae429e 100644
--- a/iree/compiler/InputConversion/MHLO/test/mhlo_to_mhlo_preprocessing.mlir
+++ b/iree/compiler/InputConversion/MHLO/test/mhlo_to_mhlo_preprocessing.mlir
@@ -38,16 +38,17 @@
     %0 = "mhlo.reshape"(%arg1) : (tensor<2x2x2x3xf32>) -> tensor<2x2x1x6xf32>
     %1 = "mhlo.convolution"(%arg0, %0) {
       batch_group_count = 1 : i64,
-      dimension_numbers = {
-        input_batch_dimension = 0 : i64,
-        input_feature_dimension = 3 : i64,
-        input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>,
-        kernel_input_feature_dimension = 2 : i64,
-        kernel_output_feature_dimension = 3 : i64,
-        kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>,
-        output_batch_dimension = 0 : i64,
-        output_feature_dimension = 3 : i64,
-        output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>},
+      dimension_numbers = #mhlo.conv<raw
+      input_batch_dimension = 0,
+      input_feature_dimension = 3,
+      input_spatial_dimensions = [1, 2],
+      kernel_input_feature_dimension = 2,
+      kernel_output_feature_dimension = 3,
+      kernel_spatial_dimensions = [0, 1],
+      output_batch_dimension = 0,
+      output_feature_dimension = 3,
+      output_spatial_dimensions = [1, 2]
+    >,
      feature_group_count = 2 : i64,
      padding = dense<0> : tensor<2x2xi64>,
      rhs_dilation = dense<1> : tensor<2xi64>,
diff --git a/iree/compiler/InputConversion/MHLO/test/mhlo_to_mhlo_preprocessing_extract_pad_from_conv.mlir b/iree/compiler/InputConversion/MHLO/test/mhlo_to_mhlo_preprocessing_extract_pad_from_conv.mlir
index 686fd4e..c3a4f6a 100644
--- a/iree/compiler/InputConversion/MHLO/test/mhlo_to_mhlo_preprocessing_extract_pad_from_conv.mlir
+++ b/iree/compiler/InputConversion/MHLO/test/mhlo_to_mhlo_preprocessing_extract_pad_from_conv.mlir
@@ -9,16 +9,17 @@
 func @conv(%inputs: tensor<1x4x5x2xf32>, %weights: tensor<3x2x2x1xf32>) -> tensor<1x4x5x1xf32> {
   %0 = "mhlo.convolution"(%inputs, %weights) {
   batch_group_count = 1 : i64,
-  dimension_numbers = {
-    input_batch_dimension = 0 : i64,
-    input_feature_dimension = 3 : i64,
-    input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>,
-    kernel_input_feature_dimension = 2 : i64,
-    kernel_output_feature_dimension = 3 : i64,
-    kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>,
-    output_batch_dimension = 0 : i64,
-    output_feature_dimension = 3 : i64,
-    output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>},
+  dimension_numbers = #mhlo.conv<raw
+      input_batch_dimension = 0,
+      input_feature_dimension = 3,
+      input_spatial_dimensions = [1, 2],
+      kernel_input_feature_dimension = 2,
+      kernel_output_feature_dimension = 3,
+      kernel_spatial_dimensions = [0, 1],
+      output_batch_dimension = 0,
+      output_feature_dimension = 3,
+      output_spatial_dimensions = [1, 2]
+    >,
   feature_group_count = 1 : i64,
   padding = dense<[[1, 1], [0, 1]]> : tensor<2x2xi64>,
   rhs_dilation = dense<1> : tensor<2xi64>,
diff --git a/iree/test/e2e/models/edge_detection.mlir b/iree/test/e2e/models/edge_detection.mlir
index 5fbf4d2..6e65f21 100644
--- a/iree/test/e2e/models/edge_detection.mlir
+++ b/iree/test/e2e/models/edge_detection.mlir
@@ -12,9 +12,9 @@
   func @edge_detect_sobel_operator(%arg0: tensor<1x128x128x1xf32>) -> tensor<1x128x128x1xf32> {
     %0 = mhlo.constant dense<[[[[-1.000000e+00]], [[0.000000e+00]], [[1.000000e+00]]], [[[-2.000000e+00]], [[0.000000e+00]], [[2.000000e+00]]], [[[-1.000000e+00]], [[0.000000e+00]], [[1.000000e+00]]]]> : tensor<3x3x1x1xf32>
     %1 = mhlo.constant dense<[[[[1.000000e+00]], [[2.000000e+00]], [[1.000000e+00]]], [[[0.000000e+00]], [[0.000000e+00]], [[0.000000e+00]]], [[[-1.000000e+00]], [[-2.000000e+00]], [[-1.000000e+00]]]]> : tensor<3x3x1x1xf32>
-    %2 = "mhlo.convolution"(%arg0, %0) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x128x128x1xf32>, tensor<3x3x1x1xf32>) -> tensor<1x128x128x1xf32>
+    %2 = "mhlo.convolution"(%arg0, %0) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x128x128x1xf32>, tensor<3x3x1x1xf32>) -> tensor<1x128x128x1xf32>
     %3 = mhlo.multiply %2, %2 : tensor<1x128x128x1xf32>
-    %4 = "mhlo.convolution"(%arg0, %1) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x128x128x1xf32>, tensor<3x3x1x1xf32>) -> tensor<1x128x128x1xf32>
+    %4 = "mhlo.convolution"(%arg0, %1) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x128x128x1xf32>, tensor<3x3x1x1xf32>) -> tensor<1x128x128x1xf32>
     %5 = mhlo.multiply %4, %4 : tensor<1x128x128x1xf32>
     %6 = mhlo.add %3, %5 : tensor<1x128x128x1xf32>
     %7 = "mhlo.sqrt"(%6) : (tensor<1x128x128x1xf32>) -> tensor<1x128x128x1xf32>
diff --git a/iree/test/e2e/models/mobilenetv3_fake_weights.mlir b/iree/test/e2e/models/mobilenetv3_fake_weights.mlir
index 5bcdccc..d6aae0f 100644
--- a/iree/test/e2e/models/mobilenetv3_fake_weights.mlir
+++ b/iree/test/e2e/models/mobilenetv3_fake_weights.mlir
@@ -690,7 +690,7 @@
     %476 = util.global.load.indirect %208 : !util.ptr<tensor<1x1x1024x1000xf32>> -> tensor<1x1x1024x1000xf32>
     %477 = mhlo.multiply %arg0, %210 : tensor<1x224x224x3xf32>
     %478 = mhlo.add %477, %211 : tensor<1x224x224x3xf32>
-    %479 = "mhlo.convolution"(%478, %278) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<[[0, 1], [0, 1]]> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : (tensor<1x224x224x3xf32>, tensor<3x3x3x16xf32>) -> tensor<1x112x112x16xf32>
+    %479 = "mhlo.convolution"(%478, %278) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<[[0, 1], [0, 1]]> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : (tensor<1x224x224x3xf32>, tensor<3x3x3x16xf32>) -> tensor<1x112x112x16xf32>
     %480 = "mhlo.batch_norm_inference"(%479, %277, %276, %275, %274) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x112x112x16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>) -> tensor<1x112x112x16xf32>
     %481 = mhlo.add %480, %212 : tensor<1x112x112x16xf32>
     %482 = "mhlo.clamp"(%266, %481, %264) : (tensor<f32>, tensor<1x112x112x16xf32>, tensor<f32>) -> tensor<1x112x112x16xf32>
@@ -698,7 +698,7 @@
     %484 = mhlo.multiply %483, %480 : tensor<1x112x112x16xf32>
     %485 = "mhlo.pad"(%484, %266) {edge_padding_high = dense<[0, 1, 1, 0]> : tensor<4xi64>, edge_padding_low = dense<0> : tensor<4xi64>, interior_padding = dense<0> : tensor<4xi64>} : (tensor<1x112x112x16xf32>, tensor<f32>) -> tensor<1x113x113x16xf32>
     %486 = "mhlo.reshape"(%465) : (tensor<3x3x16x1xf32>) -> tensor<3x3x1x16xf32>
-    %487 = "mhlo.convolution"(%485, %486) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 16 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : (tensor<1x113x113x16xf32>, tensor<3x3x1x16xf32>) -> tensor<1x56x56x16xf32>
+    %487 = "mhlo.convolution"(%485, %486) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 16 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : (tensor<1x113x113x16xf32>, tensor<3x3x1x16xf32>) -> tensor<1x56x56x16xf32>
     %488 = "mhlo.batch_norm_inference"(%487, %464, %463, %462, %461) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x56x56x16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>) -> tensor<1x56x56x16xf32>
     %489 = mhlo.maximum %488, %246 : tensor<1x56x56x16xf32>
     %490 = "mhlo.reduce"(%489, %266) ( {
@@ -708,11 +708,11 @@
     }) {dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<1x56x56x16xf32>, tensor<f32>) -> tensor<1x16xf32>
     %491 = mhlo.divide %490, %247 : tensor<1x16xf32>
     %492 = "mhlo.reshape"(%491) : (tensor<1x16xf32>) -> tensor<1x1x1x16xf32>
-    %493 = "mhlo.convolution"(%492, %474) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x16xf32>, tensor<1x1x16x8xf32>) -> tensor<1x1x1x8xf32>
+    %493 = "mhlo.convolution"(%492, %474) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x16xf32>, tensor<1x1x16x8xf32>) -> tensor<1x1x1x8xf32>
     %494 = "mhlo.broadcast_in_dim"(%473) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8xf32>) -> tensor<1x1x1x8xf32>
     %495 = mhlo.add %493, %494 : tensor<1x1x1x8xf32>
     %496 = mhlo.maximum %495, %248 : tensor<1x1x1x8xf32>
-    %497 = "mhlo.convolution"(%496, %472) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x8xf32>, tensor<1x1x8x16xf32>) -> tensor<1x1x1x16xf32>
+    %497 = "mhlo.convolution"(%496, %472) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x8xf32>, tensor<1x1x8x16xf32>) -> tensor<1x1x1x16xf32>
     %498 = "mhlo.broadcast_in_dim"(%471) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<16xf32>) -> tensor<1x1x1x16xf32>
     %499 = mhlo.add %497, %498 : tensor<1x1x1x16xf32>
     %500 = mhlo.add %499, %213 : tensor<1x1x1x16xf32>
@@ -720,29 +720,29 @@
     %502 = mhlo.multiply %501, %230 : tensor<1x1x1x16xf32>
     %503 = "mhlo.broadcast_in_dim"(%502) {broadcast_dimensions = dense<[0, 1, 2, 3]> : tensor<4xi64>} : (tensor<1x1x1x16xf32>) -> tensor<1x56x56x16xf32>
     %504 = mhlo.multiply %489, %503 : tensor<1x56x56x16xf32>
-    %505 = "mhlo.convolution"(%504, %470) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x56x56x16xf32>, tensor<1x1x16x16xf32>) -> tensor<1x56x56x16xf32>
+    %505 = "mhlo.convolution"(%504, %470) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x56x56x16xf32>, tensor<1x1x16x16xf32>) -> tensor<1x56x56x16xf32>
     %506 = "mhlo.batch_norm_inference"(%505, %469, %468, %467, %466) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x56x56x16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>) -> tensor<1x56x56x16xf32>
-    %507 = "mhlo.convolution"(%506, %307) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x56x56x16xf32>, tensor<1x1x16x72xf32>) -> tensor<1x56x56x72xf32>
+    %507 = "mhlo.convolution"(%506, %307) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x56x56x16xf32>, tensor<1x1x16x72xf32>) -> tensor<1x56x56x72xf32>
     %508 = "mhlo.batch_norm_inference"(%507, %306, %305, %304, %303) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x56x56x72xf32>, tensor<72xf32>, tensor<72xf32>, tensor<72xf32>, tensor<72xf32>) -> tensor<1x56x56x72xf32>
     %509 = mhlo.maximum %508, %249 : tensor<1x56x56x72xf32>
     %510 = "mhlo.pad"(%509, %266) {edge_padding_high = dense<[0, 1, 1, 0]> : tensor<4xi64>, edge_padding_low = dense<0> : tensor<4xi64>, interior_padding = dense<0> : tensor<4xi64>} : (tensor<1x56x56x72xf32>, tensor<f32>) -> tensor<1x57x57x72xf32>
     %511 = "mhlo.reshape"(%302) : (tensor<3x3x72x1xf32>) -> tensor<3x3x1x72xf32>
-    %512 = "mhlo.convolution"(%510, %511) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 72 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : (tensor<1x57x57x72xf32>, tensor<3x3x1x72xf32>) -> tensor<1x28x28x72xf32>
+    %512 = "mhlo.convolution"(%510, %511) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 72 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : (tensor<1x57x57x72xf32>, tensor<3x3x1x72xf32>) -> tensor<1x28x28x72xf32>
     %513 = "mhlo.batch_norm_inference"(%512, %301, %300, %299, %298) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x28x28x72xf32>, tensor<72xf32>, tensor<72xf32>, tensor<72xf32>, tensor<72xf32>) -> tensor<1x28x28x72xf32>
     %514 = mhlo.maximum %513, %250 : tensor<1x28x28x72xf32>
-    %515 = "mhlo.convolution"(%514, %312) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x72xf32>, tensor<1x1x72x24xf32>) -> tensor<1x28x28x24xf32>
+    %515 = "mhlo.convolution"(%514, %312) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x72xf32>, tensor<1x1x72x24xf32>) -> tensor<1x28x28x24xf32>
     %516 = "mhlo.batch_norm_inference"(%515, %311, %310, %309, %308) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x28x28x24xf32>, tensor<24xf32>, tensor<24xf32>, tensor<24xf32>, tensor<24xf32>) -> tensor<1x28x28x24xf32>
-    %517 = "mhlo.convolution"(%516, %322) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x24xf32>, tensor<1x1x24x88xf32>) -> tensor<1x28x28x88xf32>
+    %517 = "mhlo.convolution"(%516, %322) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x24xf32>, tensor<1x1x24x88xf32>) -> tensor<1x28x28x88xf32>
     %518 = "mhlo.batch_norm_inference"(%517, %321, %320, %319, %318) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x28x28x88xf32>, tensor<88xf32>, tensor<88xf32>, tensor<88xf32>, tensor<88xf32>) -> tensor<1x28x28x88xf32>
     %519 = mhlo.maximum %518, %251 : tensor<1x28x28x88xf32>
     %520 = "mhlo.reshape"(%317) : (tensor<3x3x88x1xf32>) -> tensor<3x3x1x88xf32>
-    %521 = "mhlo.convolution"(%519, %520) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 88 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x88xf32>, tensor<3x3x1x88xf32>) -> tensor<1x28x28x88xf32>
+    %521 = "mhlo.convolution"(%519, %520) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 88 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x88xf32>, tensor<3x3x1x88xf32>) -> tensor<1x28x28x88xf32>
     %522 = "mhlo.batch_norm_inference"(%521, %316, %315, %314, %313) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x28x28x88xf32>, tensor<88xf32>, tensor<88xf32>, tensor<88xf32>, tensor<88xf32>) -> tensor<1x28x28x88xf32>
     %523 = mhlo.maximum %522, %251 : tensor<1x28x28x88xf32>
-    %524 = "mhlo.convolution"(%523, %327) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x88xf32>, tensor<1x1x88x24xf32>) -> tensor<1x28x28x24xf32>
+    %524 = "mhlo.convolution"(%523, %327) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x88xf32>, tensor<1x1x88x24xf32>) -> tensor<1x28x28x24xf32>
     %525 = "mhlo.batch_norm_inference"(%524, %326, %325, %324, %323) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x28x28x24xf32>, tensor<24xf32>, tensor<24xf32>, tensor<24xf32>, tensor<24xf32>) -> tensor<1x28x28x24xf32>
     %526 = mhlo.add %516, %525 : tensor<1x28x28x24xf32>
-    %527 = "mhlo.convolution"(%526, %337) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x24xf32>, tensor<1x1x24x96xf32>) -> tensor<1x28x28x96xf32>
+    %527 = "mhlo.convolution"(%526, %337) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x24xf32>, tensor<1x1x24x96xf32>) -> tensor<1x28x28x96xf32>
     %528 = "mhlo.batch_norm_inference"(%527, %336, %335, %334, %333) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x28x28x96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>) -> tensor<1x28x28x96xf32>
     %529 = mhlo.add %528, %214 : tensor<1x28x28x96xf32>
     %530 = "mhlo.clamp"(%266, %529, %264) : (tensor<f32>, tensor<1x28x28x96xf32>, tensor<f32>) -> tensor<1x28x28x96xf32>
@@ -750,7 +750,7 @@
     %532 = mhlo.multiply %531, %528 : tensor<1x28x28x96xf32>
     %533 = "mhlo.pad"(%532, %266) {edge_padding_high = dense<[0, 2, 2, 0]> : tensor<4xi64>, edge_padding_low = dense<[0, 1, 1, 0]> : tensor<4xi64>, interior_padding = dense<0> : tensor<4xi64>} : (tensor<1x28x28x96xf32>, tensor<f32>) -> tensor<1x31x31x96xf32>
     %534 = "mhlo.reshape"(%332) : (tensor<5x5x96x1xf32>) -> tensor<5x5x1x96xf32>
-    %535 = "mhlo.convolution"(%533, %534) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 96 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : (tensor<1x31x31x96xf32>, tensor<5x5x1x96xf32>) -> tensor<1x14x14x96xf32>
+    %535 = "mhlo.convolution"(%533, %534) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 96 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : (tensor<1x31x31x96xf32>, tensor<5x5x1x96xf32>) -> tensor<1x14x14x96xf32>
     %536 = "mhlo.batch_norm_inference"(%535, %331, %330, %329, %328) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x14x14x96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>) -> tensor<1x14x14x96xf32>
     %537 = mhlo.add %536, %215 : tensor<1x14x14x96xf32>
     %538 = "mhlo.clamp"(%266, %537, %264) : (tensor<f32>, tensor<1x14x14x96xf32>, tensor<f32>) -> tensor<1x14x14x96xf32>
@@ -763,11 +763,11 @@
     }) {dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<1x14x14x96xf32>, tensor<f32>) -> tensor<1x96xf32>
     %542 = mhlo.divide %541, %252 : tensor<1x96xf32>
     %543 = "mhlo.reshape"(%542) : (tensor<1x96xf32>) -> tensor<1x1x1x96xf32>
-    %544 = "mhlo.convolution"(%543, %346) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x96xf32>, tensor<1x1x96x24xf32>) -> tensor<1x1x1x24xf32>
+    %544 = "mhlo.convolution"(%543, %346) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x96xf32>, tensor<1x1x96x24xf32>) -> tensor<1x1x1x24xf32>
     %545 = "mhlo.broadcast_in_dim"(%345) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<24xf32>) -> tensor<1x1x1x24xf32>
     %546 = mhlo.add %544, %545 : tensor<1x1x1x24xf32>
     %547 = mhlo.maximum %546, %253 : tensor<1x1x1x24xf32>
-    %548 = "mhlo.convolution"(%547, %344) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x24xf32>, tensor<1x1x24x96xf32>) -> tensor<1x1x1x96xf32>
+    %548 = "mhlo.convolution"(%547, %344) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x24xf32>, tensor<1x1x24x96xf32>) -> tensor<1x1x1x96xf32>
     %549 = "mhlo.broadcast_in_dim"(%343) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<96xf32>) -> tensor<1x1x1x96xf32>
     %550 = mhlo.add %548, %549 : tensor<1x1x1x96xf32>
     %551 = mhlo.add %550, %216 : tensor<1x1x1x96xf32>
@@ -775,16 +775,16 @@
     %553 = mhlo.multiply %552, %233 : tensor<1x1x1x96xf32>
     %554 = "mhlo.broadcast_in_dim"(%553) {broadcast_dimensions = dense<[0, 1, 2, 3]> : tensor<4xi64>} : (tensor<1x1x1x96xf32>) -> tensor<1x14x14x96xf32>
     %555 = mhlo.multiply %540, %554 : tensor<1x14x14x96xf32>
-    %556 = "mhlo.convolution"(%555, %342) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x96xf32>, tensor<1x1x96x40xf32>) -> tensor<1x14x14x40xf32>
+    %556 = "mhlo.convolution"(%555, %342) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x96xf32>, tensor<1x1x96x40xf32>) -> tensor<1x14x14x40xf32>
     %557 = "mhlo.batch_norm_inference"(%556, %341, %340, %339, %338) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x14x14x40xf32>, tensor<40xf32>, tensor<40xf32>, tensor<40xf32>, tensor<40xf32>) -> tensor<1x14x14x40xf32>
-    %558 = "mhlo.convolution"(%557, %356) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x40xf32>, tensor<1x1x40x240xf32>) -> tensor<1x14x14x240xf32>
+    %558 = "mhlo.convolution"(%557, %356) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x40xf32>, tensor<1x1x40x240xf32>) -> tensor<1x14x14x240xf32>
     %559 = "mhlo.batch_norm_inference"(%558, %355, %354, %353, %352) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x14x14x240xf32>, tensor<240xf32>, tensor<240xf32>, tensor<240xf32>, tensor<240xf32>) -> tensor<1x14x14x240xf32>
     %560 = mhlo.add %559, %217 : tensor<1x14x14x240xf32>
     %561 = "mhlo.clamp"(%266, %560, %264) : (tensor<f32>, tensor<1x14x14x240xf32>, tensor<f32>) -> tensor<1x14x14x240xf32>
     %562 = mhlo.multiply %561, %234 : tensor<1x14x14x240xf32>
     %563 = mhlo.multiply %562, %559 : tensor<1x14x14x240xf32>
     %564 = "mhlo.reshape"(%351) : (tensor<5x5x240x1xf32>) -> tensor<5x5x1x240xf32>
-    %565 = "mhlo.convolution"(%563, %564) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 240 : i64, padding = dense<2> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x240xf32>, tensor<5x5x1x240xf32>) -> tensor<1x14x14x240xf32>
+    %565 = "mhlo.convolution"(%563, %564) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 240 : i64, padding = dense<2> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x240xf32>, tensor<5x5x1x240xf32>) -> tensor<1x14x14x240xf32>
     %566 = "mhlo.batch_norm_inference"(%565, %350, %349, %348, %347) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x14x14x240xf32>, tensor<240xf32>, tensor<240xf32>, tensor<240xf32>, tensor<240xf32>) -> tensor<1x14x14x240xf32>
     %567 = mhlo.add %566, %217 : tensor<1x14x14x240xf32>
     %568 = "mhlo.clamp"(%266, %567, %264) : (tensor<f32>, tensor<1x14x14x240xf32>, tensor<f32>) -> tensor<1x14x14x240xf32>
@@ -797,11 +797,11 @@
     }) {dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<1x14x14x240xf32>, tensor<f32>) -> tensor<1x240xf32>
     %572 = mhlo.divide %571, %254 : tensor<1x240xf32>
     %573 = "mhlo.reshape"(%572) : (tensor<1x240xf32>) -> tensor<1x1x1x240xf32>
-    %574 = "mhlo.convolution"(%573, %365) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x240xf32>, tensor<1x1x240x64xf32>) -> tensor<1x1x1x64xf32>
+    %574 = "mhlo.convolution"(%573, %365) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x240xf32>, tensor<1x1x240x64xf32>) -> tensor<1x1x1x64xf32>
     %575 = "mhlo.broadcast_in_dim"(%364) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<64xf32>) -> tensor<1x1x1x64xf32>
     %576 = mhlo.add %574, %575 : tensor<1x1x1x64xf32>
     %577 = mhlo.maximum %576, %255 : tensor<1x1x1x64xf32>
-    %578 = "mhlo.convolution"(%577, %363) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x64xf32>, tensor<1x1x64x240xf32>) -> tensor<1x1x1x240xf32>
+    %578 = "mhlo.convolution"(%577, %363) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x64xf32>, tensor<1x1x64x240xf32>) -> tensor<1x1x1x240xf32>
     %579 = "mhlo.broadcast_in_dim"(%362) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<240xf32>) -> tensor<1x1x1x240xf32>
     %580 = mhlo.add %578, %579 : tensor<1x1x1x240xf32>
     %581 = mhlo.add %580, %218 : tensor<1x1x1x240xf32>
@@ -809,17 +809,17 @@
     %583 = mhlo.multiply %582, %235 : tensor<1x1x1x240xf32>
     %584 = "mhlo.broadcast_in_dim"(%583) {broadcast_dimensions = dense<[0, 1, 2, 3]> : tensor<4xi64>} : (tensor<1x1x1x240xf32>) -> tensor<1x14x14x240xf32>
     %585 = mhlo.multiply %570, %584 : tensor<1x14x14x240xf32>
-    %586 = "mhlo.convolution"(%585, %361) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x240xf32>, tensor<1x1x240x40xf32>) -> tensor<1x14x14x40xf32>
+    %586 = "mhlo.convolution"(%585, %361) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x240xf32>, tensor<1x1x240x40xf32>) -> tensor<1x14x14x40xf32>
     %587 = "mhlo.batch_norm_inference"(%586, %360, %359, %358, %357) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x14x14x40xf32>, tensor<40xf32>, tensor<40xf32>, tensor<40xf32>, tensor<40xf32>) -> tensor<1x14x14x40xf32>
     %588 = mhlo.add %557, %587 : tensor<1x14x14x40xf32>
-    %589 = "mhlo.convolution"(%588, %375) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x40xf32>, tensor<1x1x40x240xf32>) -> tensor<1x14x14x240xf32>
+    %589 = "mhlo.convolution"(%588, %375) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x40xf32>, tensor<1x1x40x240xf32>) -> tensor<1x14x14x240xf32>
     %590 = "mhlo.batch_norm_inference"(%589, %374, %373, %372, %371) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x14x14x240xf32>, tensor<240xf32>, tensor<240xf32>, tensor<240xf32>, tensor<240xf32>) -> tensor<1x14x14x240xf32>
     %591 = mhlo.add %590, %217 : tensor<1x14x14x240xf32>
     %592 = "mhlo.clamp"(%266, %591, %264) : (tensor<f32>, tensor<1x14x14x240xf32>, tensor<f32>) -> tensor<1x14x14x240xf32>
     %593 = mhlo.multiply %592, %234 : tensor<1x14x14x240xf32>
     %594 = mhlo.multiply %593, %590 : tensor<1x14x14x240xf32>
     %595 = "mhlo.reshape"(%370) : (tensor<5x5x240x1xf32>) -> tensor<5x5x1x240xf32>
-    %596 = "mhlo.convolution"(%594, %595) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 240 : i64, padding = dense<2> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x240xf32>, tensor<5x5x1x240xf32>) -> tensor<1x14x14x240xf32>
+    %596 = "mhlo.convolution"(%594, %595) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 240 : i64, padding = dense<2> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x240xf32>, tensor<5x5x1x240xf32>) -> tensor<1x14x14x240xf32>
     %597 = "mhlo.batch_norm_inference"(%596, %369, %368, %367, %366) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x14x14x240xf32>, tensor<240xf32>, tensor<240xf32>, tensor<240xf32>, tensor<240xf32>) -> tensor<1x14x14x240xf32>
     %598 = mhlo.add %597, %217 : tensor<1x14x14x240xf32>
     %599 = "mhlo.clamp"(%266, %598, %264) : (tensor<f32>, tensor<1x14x14x240xf32>, tensor<f32>) -> tensor<1x14x14x240xf32>
@@ -832,11 +832,11 @@
     }) {dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<1x14x14x240xf32>, tensor<f32>) -> tensor<1x240xf32>
     %603 = mhlo.divide %602, %254 : tensor<1x240xf32>
     %604 = "mhlo.reshape"(%603) : (tensor<1x240xf32>) -> tensor<1x1x1x240xf32>
-    %605 = "mhlo.convolution"(%604, %384) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x240xf32>, tensor<1x1x240x64xf32>) -> tensor<1x1x1x64xf32>
+    %605 = "mhlo.convolution"(%604, %384) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x240xf32>, tensor<1x1x240x64xf32>) -> tensor<1x1x1x64xf32>
     %606 = "mhlo.broadcast_in_dim"(%383) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<64xf32>) -> tensor<1x1x1x64xf32>
     %607 = mhlo.add %605, %606 : tensor<1x1x1x64xf32>
     %608 = mhlo.maximum %607, %255 : tensor<1x1x1x64xf32>
-    %609 = "mhlo.convolution"(%608, %382) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x64xf32>, tensor<1x1x64x240xf32>) -> tensor<1x1x1x240xf32>
+    %609 = "mhlo.convolution"(%608, %382) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x64xf32>, tensor<1x1x64x240xf32>) -> tensor<1x1x1x240xf32>
     %610 = "mhlo.broadcast_in_dim"(%381) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<240xf32>) -> tensor<1x1x1x240xf32>
     %611 = mhlo.add %609, %610 : tensor<1x1x1x240xf32>
     %612 = mhlo.add %611, %218 : tensor<1x1x1x240xf32>
@@ -844,17 +844,17 @@
     %614 = mhlo.multiply %613, %235 : tensor<1x1x1x240xf32>
     %615 = "mhlo.broadcast_in_dim"(%614) {broadcast_dimensions = dense<[0, 1, 2, 3]> : tensor<4xi64>} : (tensor<1x1x1x240xf32>) -> tensor<1x14x14x240xf32>
     %616 = mhlo.multiply %601, %615 : tensor<1x14x14x240xf32>
-    %617 = "mhlo.convolution"(%616, %380) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x240xf32>, tensor<1x1x240x40xf32>) -> tensor<1x14x14x40xf32>
+    %617 = "mhlo.convolution"(%616, %380) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x240xf32>, tensor<1x1x240x40xf32>) -> tensor<1x14x14x40xf32>
     %618 = "mhlo.batch_norm_inference"(%617, %379, %378, %377, %376) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x14x14x40xf32>, tensor<40xf32>, tensor<40xf32>, tensor<40xf32>, tensor<40xf32>) -> tensor<1x14x14x40xf32>
     %619 = mhlo.add %588, %618 : tensor<1x14x14x40xf32>
-    %620 = "mhlo.convolution"(%619, %394) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x40xf32>, tensor<1x1x40x120xf32>) -> tensor<1x14x14x120xf32>
+    %620 = "mhlo.convolution"(%619, %394) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x40xf32>, tensor<1x1x40x120xf32>) -> tensor<1x14x14x120xf32>
     %621 = "mhlo.batch_norm_inference"(%620, %393, %392, %391, %390) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x14x14x120xf32>, tensor<120xf32>, tensor<120xf32>, tensor<120xf32>, tensor<120xf32>) -> tensor<1x14x14x120xf32>
     %622 = mhlo.add %621, %219 : tensor<1x14x14x120xf32>
     %623 = "mhlo.clamp"(%266, %622, %264) : (tensor<f32>, tensor<1x14x14x120xf32>, tensor<f32>) -> tensor<1x14x14x120xf32>
     %624 = mhlo.multiply %623, %236 : tensor<1x14x14x120xf32>
     %625 = mhlo.multiply %624, %621 : tensor<1x14x14x120xf32>
     %626 = "mhlo.reshape"(%389) : (tensor<5x5x120x1xf32>) -> tensor<5x5x1x120xf32>
-    %627 = "mhlo.convolution"(%625, %626) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 120 : i64, padding = dense<2> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x120xf32>, tensor<5x5x1x120xf32>) -> tensor<1x14x14x120xf32>
+    %627 = "mhlo.convolution"(%625, %626) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 120 : i64, padding = dense<2> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x120xf32>, tensor<5x5x1x120xf32>) -> tensor<1x14x14x120xf32>
     %628 = "mhlo.batch_norm_inference"(%627, %388, %387, %386, %385) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x14x14x120xf32>, tensor<120xf32>, tensor<120xf32>, tensor<120xf32>, tensor<120xf32>) -> tensor<1x14x14x120xf32>
     %629 = mhlo.add %628, %219 : tensor<1x14x14x120xf32>
     %630 = "mhlo.clamp"(%266, %629, %264) : (tensor<f32>, tensor<1x14x14x120xf32>, tensor<f32>) -> tensor<1x14x14x120xf32>
@@ -867,11 +867,11 @@
     }) {dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<1x14x14x120xf32>, tensor<f32>) -> tensor<1x120xf32>
     %634 = mhlo.divide %633, %256 : tensor<1x120xf32>
     %635 = "mhlo.reshape"(%634) : (tensor<1x120xf32>) -> tensor<1x1x1x120xf32>
-    %636 = "mhlo.convolution"(%635, %403) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x120xf32>, tensor<1x1x120x32xf32>) -> tensor<1x1x1x32xf32>
+    %636 = "mhlo.convolution"(%635, %403) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x120xf32>, tensor<1x1x120x32xf32>) -> tensor<1x1x1x32xf32>
     %637 = "mhlo.broadcast_in_dim"(%402) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<32xf32>) -> tensor<1x1x1x32xf32>
     %638 = mhlo.add %636, %637 : tensor<1x1x1x32xf32>
     %639 = mhlo.maximum %638, %257 : tensor<1x1x1x32xf32>
-    %640 = "mhlo.convolution"(%639, %401) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x32xf32>, tensor<1x1x32x120xf32>) -> tensor<1x1x1x120xf32>
+    %640 = "mhlo.convolution"(%639, %401) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x32xf32>, tensor<1x1x32x120xf32>) -> tensor<1x1x1x120xf32>
     %641 = "mhlo.broadcast_in_dim"(%400) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<120xf32>) -> tensor<1x1x1x120xf32>
     %642 = mhlo.add %640, %641 : tensor<1x1x1x120xf32>
     %643 = mhlo.add %642, %220 : tensor<1x1x1x120xf32>
@@ -879,16 +879,16 @@
     %645 = mhlo.multiply %644, %237 : tensor<1x1x1x120xf32>
     %646 = "mhlo.broadcast_in_dim"(%645) {broadcast_dimensions = dense<[0, 1, 2, 3]> : tensor<4xi64>} : (tensor<1x1x1x120xf32>) -> tensor<1x14x14x120xf32>
     %647 = mhlo.multiply %632, %646 : tensor<1x14x14x120xf32>
-    %648 = "mhlo.convolution"(%647, %399) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x120xf32>, tensor<1x1x120x48xf32>) -> tensor<1x14x14x48xf32>
+    %648 = "mhlo.convolution"(%647, %399) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x120xf32>, tensor<1x1x120x48xf32>) -> tensor<1x14x14x48xf32>
     %649 = "mhlo.batch_norm_inference"(%648, %398, %397, %396, %395) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x14x14x48xf32>, tensor<48xf32>, tensor<48xf32>, tensor<48xf32>, tensor<48xf32>) -> tensor<1x14x14x48xf32>
-    %650 = "mhlo.convolution"(%649, %413) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x48xf32>, tensor<1x1x48x144xf32>) -> tensor<1x14x14x144xf32>
+    %650 = "mhlo.convolution"(%649, %413) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x48xf32>, tensor<1x1x48x144xf32>) -> tensor<1x14x14x144xf32>
     %651 = "mhlo.batch_norm_inference"(%650, %412, %411, %410, %409) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x14x14x144xf32>, tensor<144xf32>, tensor<144xf32>, tensor<144xf32>, tensor<144xf32>) -> tensor<1x14x14x144xf32>
     %652 = mhlo.add %651, %221 : tensor<1x14x14x144xf32>
     %653 = "mhlo.clamp"(%266, %652, %264) : (tensor<f32>, tensor<1x14x14x144xf32>, tensor<f32>) -> tensor<1x14x14x144xf32>
     %654 = mhlo.multiply %653, %238 : tensor<1x14x14x144xf32>
     %655 = mhlo.multiply %654, %651 : tensor<1x14x14x144xf32>
     %656 = "mhlo.reshape"(%408) : (tensor<5x5x144x1xf32>) -> tensor<5x5x1x144xf32>
-    %657 = "mhlo.convolution"(%655, %656) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 144 : i64, padding = dense<2> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x144xf32>, tensor<5x5x1x144xf32>) -> tensor<1x14x14x144xf32>
+    %657 = "mhlo.convolution"(%655, %656) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 144 : i64, padding = dense<2> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x144xf32>, tensor<5x5x1x144xf32>) -> tensor<1x14x14x144xf32>
     %658 = "mhlo.batch_norm_inference"(%657, %407, %406, %405, %404) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x14x14x144xf32>, tensor<144xf32>, tensor<144xf32>, tensor<144xf32>, tensor<144xf32>) -> tensor<1x14x14x144xf32>
     %659 = mhlo.add %658, %221 : tensor<1x14x14x144xf32>
     %660 = "mhlo.clamp"(%266, %659, %264) : (tensor<f32>, tensor<1x14x14x144xf32>, tensor<f32>) -> tensor<1x14x14x144xf32>
@@ -901,11 +901,11 @@
     }) {dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<1x14x14x144xf32>, tensor<f32>) -> tensor<1x144xf32>
     %664 = mhlo.divide %663, %258 : tensor<1x144xf32>
     %665 = "mhlo.reshape"(%664) : (tensor<1x144xf32>) -> tensor<1x1x1x144xf32>
-    %666 = "mhlo.convolution"(%665, %422) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x144xf32>, tensor<1x1x144x40xf32>) -> tensor<1x1x1x40xf32>
+    %666 = "mhlo.convolution"(%665, %422) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x144xf32>, tensor<1x1x144x40xf32>) -> tensor<1x1x1x40xf32>
     %667 = "mhlo.broadcast_in_dim"(%421) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<40xf32>) -> tensor<1x1x1x40xf32>
     %668 = mhlo.add %666, %667 : tensor<1x1x1x40xf32>
     %669 = mhlo.maximum %668, %259 : tensor<1x1x1x40xf32>
-    %670 = "mhlo.convolution"(%669, %420) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x40xf32>, tensor<1x1x40x144xf32>) -> tensor<1x1x1x144xf32>
+    %670 = "mhlo.convolution"(%669, %420) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x40xf32>, tensor<1x1x40x144xf32>) -> tensor<1x1x1x144xf32>
     %671 = "mhlo.broadcast_in_dim"(%419) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<144xf32>) -> tensor<1x1x1x144xf32>
     %672 = mhlo.add %670, %671 : tensor<1x1x1x144xf32>
     %673 = mhlo.add %672, %222 : tensor<1x1x1x144xf32>
@@ -913,10 +913,10 @@
     %675 = mhlo.multiply %674, %239 : tensor<1x1x1x144xf32>
     %676 = "mhlo.broadcast_in_dim"(%675) {broadcast_dimensions = dense<[0, 1, 2, 3]> : tensor<4xi64>} : (tensor<1x1x1x144xf32>) -> tensor<1x14x14x144xf32>
     %677 = mhlo.multiply %662, %676 : tensor<1x14x14x144xf32>
-    %678 = "mhlo.convolution"(%677, %418) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x144xf32>, tensor<1x1x144x48xf32>) -> tensor<1x14x14x48xf32>
+    %678 = "mhlo.convolution"(%677, %418) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x144xf32>, tensor<1x1x144x48xf32>) -> tensor<1x14x14x48xf32>
     %679 = "mhlo.batch_norm_inference"(%678, %417, %416, %415, %414) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x14x14x48xf32>, tensor<48xf32>, tensor<48xf32>, tensor<48xf32>, tensor<48xf32>) -> tensor<1x14x14x48xf32>
     %680 = mhlo.add %649, %679 : tensor<1x14x14x48xf32>
-    %681 = "mhlo.convolution"(%680, %432) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x48xf32>, tensor<1x1x48x288xf32>) -> tensor<1x14x14x288xf32>
+    %681 = "mhlo.convolution"(%680, %432) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x48xf32>, tensor<1x1x48x288xf32>) -> tensor<1x14x14x288xf32>
     %682 = "mhlo.batch_norm_inference"(%681, %431, %430, %429, %428) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x14x14x288xf32>, tensor<288xf32>, tensor<288xf32>, tensor<288xf32>, tensor<288xf32>) -> tensor<1x14x14x288xf32>
     %683 = mhlo.add %682, %223 : tensor<1x14x14x288xf32>
     %684 = "mhlo.clamp"(%266, %683, %264) : (tensor<f32>, tensor<1x14x14x288xf32>, tensor<f32>) -> tensor<1x14x14x288xf32>
@@ -924,7 +924,7 @@
     %686 = mhlo.multiply %685, %682 : tensor<1x14x14x288xf32>
     %687 = "mhlo.pad"(%686, %266) {edge_padding_high = dense<[0, 2, 2, 0]> : tensor<4xi64>, edge_padding_low = dense<[0, 1, 1, 0]> : tensor<4xi64>, interior_padding = dense<0> : tensor<4xi64>} : (tensor<1x14x14x288xf32>, tensor<f32>) -> tensor<1x17x17x288xf32>
     %688 = "mhlo.reshape"(%427) : (tensor<5x5x288x1xf32>) -> tensor<5x5x1x288xf32>
-    %689 = "mhlo.convolution"(%687, %688) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 288 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : (tensor<1x17x17x288xf32>, tensor<5x5x1x288xf32>) -> tensor<1x7x7x288xf32>
+    %689 = "mhlo.convolution"(%687, %688) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 288 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : (tensor<1x17x17x288xf32>, tensor<5x5x1x288xf32>) -> tensor<1x7x7x288xf32>
     %690 = "mhlo.batch_norm_inference"(%689, %426, %425, %424, %423) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x7x7x288xf32>, tensor<288xf32>, tensor<288xf32>, tensor<288xf32>, tensor<288xf32>) -> tensor<1x7x7x288xf32>
     %691 = mhlo.add %690, %224 : tensor<1x7x7x288xf32>
     %692 = "mhlo.clamp"(%266, %691, %264) : (tensor<f32>, tensor<1x7x7x288xf32>, tensor<f32>) -> tensor<1x7x7x288xf32>
@@ -937,11 +937,11 @@
     }) {dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<1x7x7x288xf32>, tensor<f32>) -> tensor<1x288xf32>
     %696 = mhlo.divide %695, %260 : tensor<1x288xf32>
     %697 = "mhlo.reshape"(%696) : (tensor<1x288xf32>) -> tensor<1x1x1x288xf32>
-    %698 = "mhlo.convolution"(%697, %441) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x288xf32>, tensor<1x1x288x72xf32>) -> tensor<1x1x1x72xf32>
+    %698 = "mhlo.convolution"(%697, %441) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x288xf32>, tensor<1x1x288x72xf32>) -> tensor<1x1x1x72xf32>
     %699 = "mhlo.broadcast_in_dim"(%440) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<72xf32>) -> tensor<1x1x1x72xf32>
     %700 = mhlo.add %698, %699 : tensor<1x1x1x72xf32>
     %701 = mhlo.maximum %700, %261 : tensor<1x1x1x72xf32>
-    %702 = "mhlo.convolution"(%701, %439) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x72xf32>, tensor<1x1x72x288xf32>) -> tensor<1x1x1x288xf32>
+    %702 = "mhlo.convolution"(%701, %439) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x72xf32>, tensor<1x1x72x288xf32>) -> tensor<1x1x1x288xf32>
     %703 = "mhlo.broadcast_in_dim"(%438) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<288xf32>) -> tensor<1x1x1x288xf32>
     %704 = mhlo.add %702, %703 : tensor<1x1x1x288xf32>
     %705 = mhlo.add %704, %225 : tensor<1x1x1x288xf32>
@@ -949,16 +949,16 @@
     %707 = mhlo.multiply %706, %242 : tensor<1x1x1x288xf32>
     %708 = "mhlo.broadcast_in_dim"(%707) {broadcast_dimensions = dense<[0, 1, 2, 3]> : tensor<4xi64>} : (tensor<1x1x1x288xf32>) -> tensor<1x7x7x288xf32>
     %709 = mhlo.multiply %694, %708 : tensor<1x7x7x288xf32>
-    %710 = "mhlo.convolution"(%709, %437) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x288xf32>, tensor<1x1x288x96xf32>) -> tensor<1x7x7x96xf32>
+    %710 = "mhlo.convolution"(%709, %437) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x288xf32>, tensor<1x1x288x96xf32>) -> tensor<1x7x7x96xf32>
     %711 = "mhlo.batch_norm_inference"(%710, %436, %435, %434, %433) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x7x7x96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>) -> tensor<1x7x7x96xf32>
-    %712 = "mhlo.convolution"(%711, %451) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x96xf32>, tensor<1x1x96x576xf32>) -> tensor<1x7x7x576xf32>
+    %712 = "mhlo.convolution"(%711, %451) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x96xf32>, tensor<1x1x96x576xf32>) -> tensor<1x7x7x576xf32>
     %713 = "mhlo.batch_norm_inference"(%712, %450, %449, %448, %447) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x7x7x576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>) -> tensor<1x7x7x576xf32>
     %714 = mhlo.add %713, %227 : tensor<1x7x7x576xf32>
     %715 = "mhlo.clamp"(%266, %714, %264) : (tensor<f32>, tensor<1x7x7x576xf32>, tensor<f32>) -> tensor<1x7x7x576xf32>
     %716 = mhlo.multiply %715, %244 : tensor<1x7x7x576xf32>
     %717 = mhlo.multiply %716, %713 : tensor<1x7x7x576xf32>
     %718 = "mhlo.reshape"(%446) : (tensor<5x5x576x1xf32>) -> tensor<5x5x1x576xf32>
-    %719 = "mhlo.convolution"(%717, %718) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 576 : i64, padding = dense<2> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x576xf32>, tensor<5x5x1x576xf32>) -> tensor<1x7x7x576xf32>
+    %719 = "mhlo.convolution"(%717, %718) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 576 : i64, padding = dense<2> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x576xf32>, tensor<5x5x1x576xf32>) -> tensor<1x7x7x576xf32>
     %720 = "mhlo.batch_norm_inference"(%719, %445, %444, %443, %442) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x7x7x576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>) -> tensor<1x7x7x576xf32>
     %721 = mhlo.add %720, %227 : tensor<1x7x7x576xf32>
     %722 = "mhlo.clamp"(%266, %721, %264) : (tensor<f32>, tensor<1x7x7x576xf32>, tensor<f32>) -> tensor<1x7x7x576xf32>
@@ -971,11 +971,11 @@
     }) {dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<1x7x7x576xf32>, tensor<f32>) -> tensor<1x576xf32>
     %726 = mhlo.divide %725, %263 : tensor<1x576xf32>
     %727 = "mhlo.reshape"(%726) : (tensor<1x576xf32>) -> tensor<1x1x1x576xf32>
-    %728 = "mhlo.convolution"(%727, %460) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x576xf32>, tensor<1x1x576x144xf32>) -> tensor<1x1x1x144xf32>
+    %728 = "mhlo.convolution"(%727, %460) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x576xf32>, tensor<1x1x576x144xf32>) -> tensor<1x1x1x144xf32>
     %729 = "mhlo.broadcast_in_dim"(%459) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<144xf32>) -> tensor<1x1x1x144xf32>
     %730 = mhlo.add %728, %729 : tensor<1x1x1x144xf32>
     %731 = mhlo.maximum %730, %262 : tensor<1x1x1x144xf32>
-    %732 = "mhlo.convolution"(%731, %458) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x144xf32>, tensor<1x1x144x576xf32>) -> tensor<1x1x1x576xf32>
+    %732 = "mhlo.convolution"(%731, %458) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x144xf32>, tensor<1x1x144x576xf32>) -> tensor<1x1x1x576xf32>
     %733 = "mhlo.broadcast_in_dim"(%457) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<576xf32>) -> tensor<1x1x1x576xf32>
     %734 = mhlo.add %732, %733 : tensor<1x1x1x576xf32>
     %735 = mhlo.add %734, %226 : tensor<1x1x1x576xf32>
@@ -983,17 +983,17 @@
     %737 = mhlo.multiply %736, %243 : tensor<1x1x1x576xf32>
     %738 = "mhlo.broadcast_in_dim"(%737) {broadcast_dimensions = dense<[0, 1, 2, 3]> : tensor<4xi64>} : (tensor<1x1x1x576xf32>) -> tensor<1x7x7x576xf32>
     %739 = mhlo.multiply %724, %738 : tensor<1x7x7x576xf32>
-    %740 = "mhlo.convolution"(%739, %456) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x576xf32>, tensor<1x1x576x96xf32>) -> tensor<1x7x7x96xf32>
+    %740 = "mhlo.convolution"(%739, %456) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x576xf32>, tensor<1x1x576x96xf32>) -> tensor<1x7x7x96xf32>
     %741 = "mhlo.batch_norm_inference"(%740, %455, %454, %453, %452) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x7x7x96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>) -> tensor<1x7x7x96xf32>
     %742 = mhlo.add %711, %741 : tensor<1x7x7x96xf32>
-    %743 = "mhlo.convolution"(%742, %288) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x96xf32>, tensor<1x1x96x576xf32>) -> tensor<1x7x7x576xf32>
+    %743 = "mhlo.convolution"(%742, %288) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x96xf32>, tensor<1x1x96x576xf32>) -> tensor<1x7x7x576xf32>
     %744 = "mhlo.batch_norm_inference"(%743, %287, %286, %285, %284) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x7x7x576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>) -> tensor<1x7x7x576xf32>
     %745 = mhlo.add %744, %227 : tensor<1x7x7x576xf32>
     %746 = "mhlo.clamp"(%266, %745, %264) : (tensor<f32>, tensor<1x7x7x576xf32>, tensor<f32>) -> tensor<1x7x7x576xf32>
     %747 = mhlo.multiply %746, %244 : tensor<1x7x7x576xf32>
     %748 = mhlo.multiply %747, %744 : tensor<1x7x7x576xf32>
     %749 = "mhlo.reshape"(%283) : (tensor<5x5x576x1xf32>) -> tensor<5x5x1x576xf32>
-    %750 = "mhlo.convolution"(%748, %749) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 576 : i64, padding = dense<2> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x576xf32>, tensor<5x5x1x576xf32>) -> tensor<1x7x7x576xf32>
+    %750 = "mhlo.convolution"(%748, %749) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 576 : i64, padding = dense<2> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x576xf32>, tensor<5x5x1x576xf32>) -> tensor<1x7x7x576xf32>
     %751 = "mhlo.batch_norm_inference"(%750, %282, %281, %280, %279) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x7x7x576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>) -> tensor<1x7x7x576xf32>
     %752 = mhlo.add %751, %227 : tensor<1x7x7x576xf32>
     %753 = "mhlo.clamp"(%266, %752, %264) : (tensor<f32>, tensor<1x7x7x576xf32>, tensor<f32>) -> tensor<1x7x7x576xf32>
@@ -1006,11 +1006,11 @@
     }) {dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<1x7x7x576xf32>, tensor<f32>) -> tensor<1x576xf32>
     %757 = mhlo.divide %756, %263 : tensor<1x576xf32>
     %758 = "mhlo.reshape"(%757) : (tensor<1x576xf32>) -> tensor<1x1x1x576xf32>
-    %759 = "mhlo.convolution"(%758, %297) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x576xf32>, tensor<1x1x576x144xf32>) -> tensor<1x1x1x144xf32>
+    %759 = "mhlo.convolution"(%758, %297) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x576xf32>, tensor<1x1x576x144xf32>) -> tensor<1x1x1x144xf32>
     %760 = "mhlo.broadcast_in_dim"(%296) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<144xf32>) -> tensor<1x1x1x144xf32>
     %761 = mhlo.add %759, %760 : tensor<1x1x1x144xf32>
     %762 = mhlo.maximum %761, %262 : tensor<1x1x1x144xf32>
-    %763 = "mhlo.convolution"(%762, %295) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x144xf32>, tensor<1x1x144x576xf32>) -> tensor<1x1x1x576xf32>
+    %763 = "mhlo.convolution"(%762, %295) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x144xf32>, tensor<1x1x144x576xf32>) -> tensor<1x1x1x576xf32>
     %764 = "mhlo.broadcast_in_dim"(%294) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<576xf32>) -> tensor<1x1x1x576xf32>
     %765 = mhlo.add %763, %764 : tensor<1x1x1x576xf32>
     %766 = mhlo.add %765, %226 : tensor<1x1x1x576xf32>
@@ -1018,10 +1018,10 @@
     %768 = mhlo.multiply %767, %243 : tensor<1x1x1x576xf32>
     %769 = "mhlo.broadcast_in_dim"(%768) {broadcast_dimensions = dense<[0, 1, 2, 3]> : tensor<4xi64>} : (tensor<1x1x1x576xf32>) -> tensor<1x7x7x576xf32>
     %770 = mhlo.multiply %755, %769 : tensor<1x7x7x576xf32>
-    %771 = "mhlo.convolution"(%770, %293) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x576xf32>, tensor<1x1x576x96xf32>) -> tensor<1x7x7x96xf32>
+    %771 = "mhlo.convolution"(%770, %293) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x576xf32>, tensor<1x1x576x96xf32>) -> tensor<1x7x7x96xf32>
     %772 = "mhlo.batch_norm_inference"(%771, %292, %291, %290, %289) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x7x7x96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>, tensor<96xf32>) -> tensor<1x7x7x96xf32>
     %773 = mhlo.add %742, %772 : tensor<1x7x7x96xf32>
-    %774 = "mhlo.convolution"(%773, %271) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x96xf32>, tensor<1x1x96x576xf32>) -> tensor<1x7x7x576xf32>
+    %774 = "mhlo.convolution"(%773, %271) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x96xf32>, tensor<1x1x96x576xf32>) -> tensor<1x7x7x576xf32>
     %775 = "mhlo.batch_norm_inference"(%774, %270, %269, %268, %267) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<1x7x7x576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>) -> tensor<1x7x7x576xf32>
     %776 = mhlo.add %775, %227 : tensor<1x7x7x576xf32>
     %777 = "mhlo.clamp"(%266, %776, %264) : (tensor<f32>, tensor<1x7x7x576xf32>, tensor<f32>) -> tensor<1x7x7x576xf32>
@@ -1034,14 +1034,14 @@
     }) {dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<1x7x7x576xf32>, tensor<f32>) -> tensor<1x576xf32>
     %781 = mhlo.divide %780, %263 : tensor<1x576xf32>
     %782 = "mhlo.reshape"(%781) : (tensor<1x576xf32>) -> tensor<1x1x1x576xf32>
-    %783 = "mhlo.convolution"(%782, %273) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x576xf32>, tensor<1x1x576x1024xf32>) -> tensor<1x1x1x1024xf32>
+    %783 = "mhlo.convolution"(%782, %273) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x576xf32>, tensor<1x1x576x1024xf32>) -> tensor<1x1x1x1024xf32>
     %784 = "mhlo.broadcast_in_dim"(%272) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<1024xf32>) -> tensor<1x1x1x1024xf32>
     %785 = mhlo.add %783, %784 : tensor<1x1x1x1024xf32>
     %786 = mhlo.add %785, %228 : tensor<1x1x1x1024xf32>
     %787 = "mhlo.clamp"(%266, %786, %264) : (tensor<f32>, tensor<1x1x1x1024xf32>, tensor<f32>) -> tensor<1x1x1x1024xf32>
     %788 = mhlo.multiply %787, %245 : tensor<1x1x1x1024xf32>
     %789 = mhlo.multiply %788, %785 : tensor<1x1x1x1024xf32>
-    %790 = "mhlo.convolution"(%789, %476) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x1024xf32>, tensor<1x1x1024x1000xf32>) -> tensor<1x1x1x1000xf32>
+    %790 = "mhlo.convolution"(%789, %476) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x1x1024xf32>, tensor<1x1x1024x1000xf32>) -> tensor<1x1x1x1000xf32>
     %791 = "mhlo.broadcast_in_dim"(%475) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<1000xf32>) -> tensor<1x1x1x1000xf32>
     %792 = mhlo.add %790, %791 : tensor<1x1x1x1000xf32>
     %793 = "mhlo.reshape"(%792) : (tensor<1x1x1x1000xf32>) -> tensor<1x1000xf32>
diff --git a/iree/test/e2e/models/resnet50_fake_weights.mlir b/iree/test/e2e/models/resnet50_fake_weights.mlir
index 764d1fa..96192e3 100644
--- a/iree/test/e2e/models/resnet50_fake_weights.mlir
+++ b/iree/test/e2e/models/resnet50_fake_weights.mlir
@@ -980,7 +980,7 @@
     %650 = util.global.load.indirect %319 : !util.ptr<tensor<1000xf32>> -> tensor<1000xf32>
     %651 = util.global.load.indirect %318 : !util.ptr<tensor<2048x1000xf32>> -> tensor<2048x1000xf32>
     %652 = "mhlo.pad"(%arg0, %331) {edge_padding_high = dense<[0, 3, 3, 0]> : tensor<4xi64>, edge_padding_low = dense<[0, 3, 3, 0]> : tensor<4xi64>, interior_padding = dense<0> : tensor<4xi64>} : (tensor<1x224x224x3xf32>, tensor<f32>) -> tensor<1x230x230x3xf32>
-    %653 = "mhlo.convolution"(%652, %337) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : (tensor<1x230x230x3xf32>, tensor<7x7x3x64xf32>) -> tensor<1x112x112x64xf32>
+    %653 = "mhlo.convolution"(%652, %337) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : (tensor<1x230x230x3xf32>, tensor<7x7x3x64xf32>) -> tensor<1x112x112x64xf32>
     %654 = "mhlo.broadcast_in_dim"(%336) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<64xf32>) -> tensor<1x112x112x64xf32>
     %655 = mhlo.add %653, %654 : tensor<1x112x112x64xf32>
     %656 = "mhlo.batch_norm_inference"(%655, %335, %334, %333, %332) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x112x112x64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) -> tensor<1x112x112x64xf32>
@@ -991,273 +991,273 @@
       %944 = mhlo.maximum %arg1, %arg2 : tensor<f32>
       "mhlo.return"(%944) : (tensor<f32>) -> ()
     }) {window_dimensions = dense<[1, 3, 3, 1]> : tensor<4xi64>, window_strides = dense<[1, 2, 2, 1]> : tensor<4xi64>} : (tensor<1x114x114x64xf32>, tensor<f32>) -> tensor<1x56x56x64xf32>
-    %660 = "mhlo.convolution"(%659, %343) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x56x56x64xf32>, tensor<1x1x64x256xf32>) -> tensor<1x56x56x256xf32>
+    %660 = "mhlo.convolution"(%659, %343) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x56x56x64xf32>, tensor<1x1x64x256xf32>) -> tensor<1x56x56x256xf32>
     %661 = "mhlo.broadcast_in_dim"(%342) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<256xf32>) -> tensor<1x56x56x256xf32>
     %662 = mhlo.add %660, %661 : tensor<1x56x56x256xf32>
     %663 = "mhlo.batch_norm_inference"(%662, %341, %340, %339, %338) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x56x56x256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>) -> tensor<1x56x56x256xf32>
-    %664 = "mhlo.convolution"(%659, %349) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x56x56x64xf32>, tensor<1x1x64x64xf32>) -> tensor<1x56x56x64xf32>
+    %664 = "mhlo.convolution"(%659, %349) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x56x56x64xf32>, tensor<1x1x64x64xf32>) -> tensor<1x56x56x64xf32>
     %665 = "mhlo.broadcast_in_dim"(%348) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<64xf32>) -> tensor<1x56x56x64xf32>
     %666 = mhlo.add %664, %665 : tensor<1x56x56x64xf32>
     %667 = "mhlo.batch_norm_inference"(%666, %347, %346, %345, %344) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x56x56x64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) -> tensor<1x56x56x64xf32>
     %668 = mhlo.maximum %667, %321 : tensor<1x56x56x64xf32>
-    %669 = "mhlo.convolution"(%668, %355) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x56x56x64xf32>, tensor<3x3x64x64xf32>) -> tensor<1x56x56x64xf32>
+    %669 = "mhlo.convolution"(%668, %355) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x56x56x64xf32>, tensor<3x3x64x64xf32>) -> tensor<1x56x56x64xf32>
     %670 = "mhlo.broadcast_in_dim"(%354) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<64xf32>) -> tensor<1x56x56x64xf32>
     %671 = mhlo.add %669, %670 : tensor<1x56x56x64xf32>
     %672 = "mhlo.batch_norm_inference"(%671, %353, %352, %351, %350) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x56x56x64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) -> tensor<1x56x56x64xf32>
     %673 = mhlo.maximum %672, %321 : tensor<1x56x56x64xf32>
-    %674 = "mhlo.convolution"(%673, %361) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x56x56x64xf32>, tensor<1x1x64x256xf32>) -> tensor<1x56x56x256xf32>
+    %674 = "mhlo.convolution"(%673, %361) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x56x56x64xf32>, tensor<1x1x64x256xf32>) -> tensor<1x56x56x256xf32>
     %675 = "mhlo.broadcast_in_dim"(%360) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<256xf32>) -> tensor<1x56x56x256xf32>
     %676 = mhlo.add %674, %675 : tensor<1x56x56x256xf32>
     %677 = "mhlo.batch_norm_inference"(%676, %359, %358, %357, %356) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x56x56x256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>) -> tensor<1x56x56x256xf32>
     %678 = mhlo.add %663, %677 : tensor<1x56x56x256xf32>
     %679 = mhlo.maximum %678, %322 : tensor<1x56x56x256xf32>
-    %680 = "mhlo.convolution"(%679, %367) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x56x56x256xf32>, tensor<1x1x256x64xf32>) -> tensor<1x56x56x64xf32>
+    %680 = "mhlo.convolution"(%679, %367) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x56x56x256xf32>, tensor<1x1x256x64xf32>) -> tensor<1x56x56x64xf32>
     %681 = "mhlo.broadcast_in_dim"(%366) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<64xf32>) -> tensor<1x56x56x64xf32>
     %682 = mhlo.add %680, %681 : tensor<1x56x56x64xf32>
     %683 = "mhlo.batch_norm_inference"(%682, %365, %364, %363, %362) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x56x56x64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) -> tensor<1x56x56x64xf32>
     %684 = mhlo.maximum %683, %321 : tensor<1x56x56x64xf32>
-    %685 = "mhlo.convolution"(%684, %373) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x56x56x64xf32>, tensor<3x3x64x64xf32>) -> tensor<1x56x56x64xf32>
+    %685 = "mhlo.convolution"(%684, %373) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x56x56x64xf32>, tensor<3x3x64x64xf32>) -> tensor<1x56x56x64xf32>
     %686 = "mhlo.broadcast_in_dim"(%372) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<64xf32>) -> tensor<1x56x56x64xf32>
     %687 = mhlo.add %685, %686 : tensor<1x56x56x64xf32>
     %688 = "mhlo.batch_norm_inference"(%687, %371, %370, %369, %368) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x56x56x64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) -> tensor<1x56x56x64xf32>
     %689 = mhlo.maximum %688, %321 : tensor<1x56x56x64xf32>
-    %690 = "mhlo.convolution"(%689, %379) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x56x56x64xf32>, tensor<1x1x64x256xf32>) -> tensor<1x56x56x256xf32>
+    %690 = "mhlo.convolution"(%689, %379) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x56x56x64xf32>, tensor<1x1x64x256xf32>) -> tensor<1x56x56x256xf32>
     %691 = "mhlo.broadcast_in_dim"(%378) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<256xf32>) -> tensor<1x56x56x256xf32>
     %692 = mhlo.add %690, %691 : tensor<1x56x56x256xf32>
     %693 = "mhlo.batch_norm_inference"(%692, %377, %376, %375, %374) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x56x56x256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>) -> tensor<1x56x56x256xf32>
     %694 = mhlo.add %679, %693 : tensor<1x56x56x256xf32>
     %695 = mhlo.maximum %694, %322 : tensor<1x56x56x256xf32>
-    %696 = "mhlo.convolution"(%695, %385) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x56x56x256xf32>, tensor<1x1x256x64xf32>) -> tensor<1x56x56x64xf32>
+    %696 = "mhlo.convolution"(%695, %385) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x56x56x256xf32>, tensor<1x1x256x64xf32>) -> tensor<1x56x56x64xf32>
     %697 = "mhlo.broadcast_in_dim"(%384) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<64xf32>) -> tensor<1x56x56x64xf32>
     %698 = mhlo.add %696, %697 : tensor<1x56x56x64xf32>
     %699 = "mhlo.batch_norm_inference"(%698, %383, %382, %381, %380) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x56x56x64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) -> tensor<1x56x56x64xf32>
     %700 = mhlo.maximum %699, %321 : tensor<1x56x56x64xf32>
-    %701 = "mhlo.convolution"(%700, %391) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x56x56x64xf32>, tensor<3x3x64x64xf32>) -> tensor<1x56x56x64xf32>
+    %701 = "mhlo.convolution"(%700, %391) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x56x56x64xf32>, tensor<3x3x64x64xf32>) -> tensor<1x56x56x64xf32>
     %702 = "mhlo.broadcast_in_dim"(%390) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<64xf32>) -> tensor<1x56x56x64xf32>
     %703 = mhlo.add %701, %702 : tensor<1x56x56x64xf32>
     %704 = "mhlo.batch_norm_inference"(%703, %389, %388, %387, %386) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x56x56x64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) -> tensor<1x56x56x64xf32>
     %705 = mhlo.maximum %704, %321 : tensor<1x56x56x64xf32>
-    %706 = "mhlo.convolution"(%705, %397) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x56x56x64xf32>, tensor<1x1x64x256xf32>) -> tensor<1x56x56x256xf32>
+    %706 = "mhlo.convolution"(%705, %397) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x56x56x64xf32>, tensor<1x1x64x256xf32>) -> tensor<1x56x56x256xf32>
     %707 = "mhlo.broadcast_in_dim"(%396) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<256xf32>) -> tensor<1x56x56x256xf32>
     %708 = mhlo.add %706, %707 : tensor<1x56x56x256xf32>
     %709 = "mhlo.batch_norm_inference"(%708, %395, %394, %393, %392) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x56x56x256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>) -> tensor<1x56x56x256xf32>
     %710 = mhlo.add %695, %709 : tensor<1x56x56x256xf32>
     %711 = mhlo.maximum %710, %322 : tensor<1x56x56x256xf32>
-    %712 = "mhlo.convolution"(%711, %403) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : (tensor<1x56x56x256xf32>, tensor<1x1x256x512xf32>) -> tensor<1x28x28x512xf32>
+    %712 = "mhlo.convolution"(%711, %403) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : (tensor<1x56x56x256xf32>, tensor<1x1x256x512xf32>) -> tensor<1x28x28x512xf32>
     %713 = "mhlo.broadcast_in_dim"(%402) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<512xf32>) -> tensor<1x28x28x512xf32>
     %714 = mhlo.add %712, %713 : tensor<1x28x28x512xf32>
     %715 = "mhlo.batch_norm_inference"(%714, %401, %400, %399, %398) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x28x28x512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>) -> tensor<1x28x28x512xf32>
-    %716 = "mhlo.convolution"(%711, %409) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : (tensor<1x56x56x256xf32>, tensor<1x1x256x128xf32>) -> tensor<1x28x28x128xf32>
+    %716 = "mhlo.convolution"(%711, %409) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : (tensor<1x56x56x256xf32>, tensor<1x1x256x128xf32>) -> tensor<1x28x28x128xf32>
     %717 = "mhlo.broadcast_in_dim"(%408) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x28x28x128xf32>
     %718 = mhlo.add %716, %717 : tensor<1x28x28x128xf32>
     %719 = "mhlo.batch_norm_inference"(%718, %407, %406, %405, %404) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x28x28x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>) -> tensor<1x28x28x128xf32>
     %720 = mhlo.maximum %719, %323 : tensor<1x28x28x128xf32>
-    %721 = "mhlo.convolution"(%720, %415) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x128xf32>, tensor<3x3x128x128xf32>) -> tensor<1x28x28x128xf32>
+    %721 = "mhlo.convolution"(%720, %415) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x128xf32>, tensor<3x3x128x128xf32>) -> tensor<1x28x28x128xf32>
     %722 = "mhlo.broadcast_in_dim"(%414) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x28x28x128xf32>
     %723 = mhlo.add %721, %722 : tensor<1x28x28x128xf32>
     %724 = "mhlo.batch_norm_inference"(%723, %413, %412, %411, %410) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x28x28x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>) -> tensor<1x28x28x128xf32>
     %725 = mhlo.maximum %724, %323 : tensor<1x28x28x128xf32>
-    %726 = "mhlo.convolution"(%725, %421) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x128xf32>, tensor<1x1x128x512xf32>) -> tensor<1x28x28x512xf32>
+    %726 = "mhlo.convolution"(%725, %421) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x128xf32>, tensor<1x1x128x512xf32>) -> tensor<1x28x28x512xf32>
     %727 = "mhlo.broadcast_in_dim"(%420) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<512xf32>) -> tensor<1x28x28x512xf32>
     %728 = mhlo.add %726, %727 : tensor<1x28x28x512xf32>
     %729 = "mhlo.batch_norm_inference"(%728, %419, %418, %417, %416) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x28x28x512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>) -> tensor<1x28x28x512xf32>
     %730 = mhlo.add %715, %729 : tensor<1x28x28x512xf32>
     %731 = mhlo.maximum %730, %324 : tensor<1x28x28x512xf32>
-    %732 = "mhlo.convolution"(%731, %427) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x512xf32>, tensor<1x1x512x128xf32>) -> tensor<1x28x28x128xf32>
+    %732 = "mhlo.convolution"(%731, %427) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x512xf32>, tensor<1x1x512x128xf32>) -> tensor<1x28x28x128xf32>
     %733 = "mhlo.broadcast_in_dim"(%426) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x28x28x128xf32>
     %734 = mhlo.add %732, %733 : tensor<1x28x28x128xf32>
     %735 = "mhlo.batch_norm_inference"(%734, %425, %424, %423, %422) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x28x28x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>) -> tensor<1x28x28x128xf32>
     %736 = mhlo.maximum %735, %323 : tensor<1x28x28x128xf32>
-    %737 = "mhlo.convolution"(%736, %433) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x128xf32>, tensor<3x3x128x128xf32>) -> tensor<1x28x28x128xf32>
+    %737 = "mhlo.convolution"(%736, %433) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x128xf32>, tensor<3x3x128x128xf32>) -> tensor<1x28x28x128xf32>
     %738 = "mhlo.broadcast_in_dim"(%432) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x28x28x128xf32>
     %739 = mhlo.add %737, %738 : tensor<1x28x28x128xf32>
     %740 = "mhlo.batch_norm_inference"(%739, %431, %430, %429, %428) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x28x28x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>) -> tensor<1x28x28x128xf32>
     %741 = mhlo.maximum %740, %323 : tensor<1x28x28x128xf32>
-    %742 = "mhlo.convolution"(%741, %439) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x128xf32>, tensor<1x1x128x512xf32>) -> tensor<1x28x28x512xf32>
+    %742 = "mhlo.convolution"(%741, %439) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x128xf32>, tensor<1x1x128x512xf32>) -> tensor<1x28x28x512xf32>
     %743 = "mhlo.broadcast_in_dim"(%438) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<512xf32>) -> tensor<1x28x28x512xf32>
     %744 = mhlo.add %742, %743 : tensor<1x28x28x512xf32>
     %745 = "mhlo.batch_norm_inference"(%744, %437, %436, %435, %434) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x28x28x512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>) -> tensor<1x28x28x512xf32>
     %746 = mhlo.add %731, %745 : tensor<1x28x28x512xf32>
     %747 = mhlo.maximum %746, %324 : tensor<1x28x28x512xf32>
-    %748 = "mhlo.convolution"(%747, %445) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x512xf32>, tensor<1x1x512x128xf32>) -> tensor<1x28x28x128xf32>
+    %748 = "mhlo.convolution"(%747, %445) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x512xf32>, tensor<1x1x512x128xf32>) -> tensor<1x28x28x128xf32>
     %749 = "mhlo.broadcast_in_dim"(%444) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x28x28x128xf32>
     %750 = mhlo.add %748, %749 : tensor<1x28x28x128xf32>
     %751 = "mhlo.batch_norm_inference"(%750, %443, %442, %441, %440) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x28x28x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>) -> tensor<1x28x28x128xf32>
     %752 = mhlo.maximum %751, %323 : tensor<1x28x28x128xf32>
-    %753 = "mhlo.convolution"(%752, %451) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x128xf32>, tensor<3x3x128x128xf32>) -> tensor<1x28x28x128xf32>
+    %753 = "mhlo.convolution"(%752, %451) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x128xf32>, tensor<3x3x128x128xf32>) -> tensor<1x28x28x128xf32>
     %754 = "mhlo.broadcast_in_dim"(%450) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x28x28x128xf32>
     %755 = mhlo.add %753, %754 : tensor<1x28x28x128xf32>
     %756 = "mhlo.batch_norm_inference"(%755, %449, %448, %447, %446) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x28x28x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>) -> tensor<1x28x28x128xf32>
     %757 = mhlo.maximum %756, %323 : tensor<1x28x28x128xf32>
-    %758 = "mhlo.convolution"(%757, %457) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x128xf32>, tensor<1x1x128x512xf32>) -> tensor<1x28x28x512xf32>
+    %758 = "mhlo.convolution"(%757, %457) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x128xf32>, tensor<1x1x128x512xf32>) -> tensor<1x28x28x512xf32>
     %759 = "mhlo.broadcast_in_dim"(%456) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<512xf32>) -> tensor<1x28x28x512xf32>
     %760 = mhlo.add %758, %759 : tensor<1x28x28x512xf32>
     %761 = "mhlo.batch_norm_inference"(%760, %455, %454, %453, %452) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x28x28x512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>) -> tensor<1x28x28x512xf32>
     %762 = mhlo.add %747, %761 : tensor<1x28x28x512xf32>
     %763 = mhlo.maximum %762, %324 : tensor<1x28x28x512xf32>
-    %764 = "mhlo.convolution"(%763, %463) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x512xf32>, tensor<1x1x512x128xf32>) -> tensor<1x28x28x128xf32>
+    %764 = "mhlo.convolution"(%763, %463) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x512xf32>, tensor<1x1x512x128xf32>) -> tensor<1x28x28x128xf32>
     %765 = "mhlo.broadcast_in_dim"(%462) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x28x28x128xf32>
     %766 = mhlo.add %764, %765 : tensor<1x28x28x128xf32>
     %767 = "mhlo.batch_norm_inference"(%766, %461, %460, %459, %458) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x28x28x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>) -> tensor<1x28x28x128xf32>
     %768 = mhlo.maximum %767, %323 : tensor<1x28x28x128xf32>
-    %769 = "mhlo.convolution"(%768, %469) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x128xf32>, tensor<3x3x128x128xf32>) -> tensor<1x28x28x128xf32>
+    %769 = "mhlo.convolution"(%768, %469) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x128xf32>, tensor<3x3x128x128xf32>) -> tensor<1x28x28x128xf32>
     %770 = "mhlo.broadcast_in_dim"(%468) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<128xf32>) -> tensor<1x28x28x128xf32>
     %771 = mhlo.add %769, %770 : tensor<1x28x28x128xf32>
     %772 = "mhlo.batch_norm_inference"(%771, %467, %466, %465, %464) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x28x28x128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>) -> tensor<1x28x28x128xf32>
     %773 = mhlo.maximum %772, %323 : tensor<1x28x28x128xf32>
-    %774 = "mhlo.convolution"(%773, %475) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x128xf32>, tensor<1x1x128x512xf32>) -> tensor<1x28x28x512xf32>
+    %774 = "mhlo.convolution"(%773, %475) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x28x28x128xf32>, tensor<1x1x128x512xf32>) -> tensor<1x28x28x512xf32>
     %775 = "mhlo.broadcast_in_dim"(%474) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<512xf32>) -> tensor<1x28x28x512xf32>
     %776 = mhlo.add %774, %775 : tensor<1x28x28x512xf32>
     %777 = "mhlo.batch_norm_inference"(%776, %473, %472, %471, %470) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x28x28x512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>) -> tensor<1x28x28x512xf32>
     %778 = mhlo.add %763, %777 : tensor<1x28x28x512xf32>
     %779 = mhlo.maximum %778, %324 : tensor<1x28x28x512xf32>
-    %780 = "mhlo.convolution"(%779, %481) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : (tensor<1x28x28x512xf32>, tensor<1x1x512x1024xf32>) -> tensor<1x14x14x1024xf32>
+    %780 = "mhlo.convolution"(%779, %481) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : (tensor<1x28x28x512xf32>, tensor<1x1x512x1024xf32>) -> tensor<1x14x14x1024xf32>
     %781 = "mhlo.broadcast_in_dim"(%480) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<1024xf32>) -> tensor<1x14x14x1024xf32>
     %782 = mhlo.add %780, %781 : tensor<1x14x14x1024xf32>
     %783 = "mhlo.batch_norm_inference"(%782, %479, %478, %477, %476) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x14x14x1024xf32>, tensor<1024xf32>, tensor<1024xf32>, tensor<1024xf32>, tensor<1024xf32>) -> tensor<1x14x14x1024xf32>
-    %784 = "mhlo.convolution"(%779, %487) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : (tensor<1x28x28x512xf32>, tensor<1x1x512x256xf32>) -> tensor<1x14x14x256xf32>
+    %784 = "mhlo.convolution"(%779, %487) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : (tensor<1x28x28x512xf32>, tensor<1x1x512x256xf32>) -> tensor<1x14x14x256xf32>
     %785 = "mhlo.broadcast_in_dim"(%486) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<256xf32>) -> tensor<1x14x14x256xf32>
     %786 = mhlo.add %784, %785 : tensor<1x14x14x256xf32>
     %787 = "mhlo.batch_norm_inference"(%786, %485, %484, %483, %482) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x14x14x256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>) -> tensor<1x14x14x256xf32>
     %788 = mhlo.maximum %787, %325 : tensor<1x14x14x256xf32>
-    %789 = "mhlo.convolution"(%788, %493) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x256xf32>, tensor<3x3x256x256xf32>) -> tensor<1x14x14x256xf32>
+    %789 = "mhlo.convolution"(%788, %493) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x256xf32>, tensor<3x3x256x256xf32>) -> tensor<1x14x14x256xf32>
     %790 = "mhlo.broadcast_in_dim"(%492) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<256xf32>) -> tensor<1x14x14x256xf32>
     %791 = mhlo.add %789, %790 : tensor<1x14x14x256xf32>
     %792 = "mhlo.batch_norm_inference"(%791, %491, %490, %489, %488) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x14x14x256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>) -> tensor<1x14x14x256xf32>
     %793 = mhlo.maximum %792, %325 : tensor<1x14x14x256xf32>
-    %794 = "mhlo.convolution"(%793, %499) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x256xf32>, tensor<1x1x256x1024xf32>) -> tensor<1x14x14x1024xf32>
+    %794 = "mhlo.convolution"(%793, %499) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x256xf32>, tensor<1x1x256x1024xf32>) -> tensor<1x14x14x1024xf32>
     %795 = "mhlo.broadcast_in_dim"(%498) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<1024xf32>) -> tensor<1x14x14x1024xf32>
     %796 = mhlo.add %794, %795 : tensor<1x14x14x1024xf32>
     %797 = "mhlo.batch_norm_inference"(%796, %497, %496, %495, %494) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x14x14x1024xf32>, tensor<1024xf32>, tensor<1024xf32>, tensor<1024xf32>, tensor<1024xf32>) -> tensor<1x14x14x1024xf32>
     %798 = mhlo.add %783, %797 : tensor<1x14x14x1024xf32>
     %799 = mhlo.maximum %798, %326 : tensor<1x14x14x1024xf32>
-    %800 = "mhlo.convolution"(%799, %505) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x1024xf32>, tensor<1x1x1024x256xf32>) -> tensor<1x14x14x256xf32>
+    %800 = "mhlo.convolution"(%799, %505) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x1024xf32>, tensor<1x1x1024x256xf32>) -> tensor<1x14x14x256xf32>
     %801 = "mhlo.broadcast_in_dim"(%504) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<256xf32>) -> tensor<1x14x14x256xf32>
     %802 = mhlo.add %800, %801 : tensor<1x14x14x256xf32>
     %803 = "mhlo.batch_norm_inference"(%802, %503, %502, %501, %500) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x14x14x256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>) -> tensor<1x14x14x256xf32>
     %804 = mhlo.maximum %803, %325 : tensor<1x14x14x256xf32>
-    %805 = "mhlo.convolution"(%804, %511) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x256xf32>, tensor<3x3x256x256xf32>) -> tensor<1x14x14x256xf32>
+    %805 = "mhlo.convolution"(%804, %511) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x256xf32>, tensor<3x3x256x256xf32>) -> tensor<1x14x14x256xf32>
     %806 = "mhlo.broadcast_in_dim"(%510) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<256xf32>) -> tensor<1x14x14x256xf32>
     %807 = mhlo.add %805, %806 : tensor<1x14x14x256xf32>
     %808 = "mhlo.batch_norm_inference"(%807, %509, %508, %507, %506) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x14x14x256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>) -> tensor<1x14x14x256xf32>
     %809 = mhlo.maximum %808, %325 : tensor<1x14x14x256xf32>
-    %810 = "mhlo.convolution"(%809, %517) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x256xf32>, tensor<1x1x256x1024xf32>) -> tensor<1x14x14x1024xf32>
+    %810 = "mhlo.convolution"(%809, %517) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x256xf32>, tensor<1x1x256x1024xf32>) -> tensor<1x14x14x1024xf32>
     %811 = "mhlo.broadcast_in_dim"(%516) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<1024xf32>) -> tensor<1x14x14x1024xf32>
     %812 = mhlo.add %810, %811 : tensor<1x14x14x1024xf32>
     %813 = "mhlo.batch_norm_inference"(%812, %515, %514, %513, %512) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x14x14x1024xf32>, tensor<1024xf32>, tensor<1024xf32>, tensor<1024xf32>, tensor<1024xf32>) -> tensor<1x14x14x1024xf32>
     %814 = mhlo.add %799, %813 : tensor<1x14x14x1024xf32>
     %815 = mhlo.maximum %814, %326 : tensor<1x14x14x1024xf32>
-    %816 = "mhlo.convolution"(%815, %523) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x1024xf32>, tensor<1x1x1024x256xf32>) -> tensor<1x14x14x256xf32>
+    %816 = "mhlo.convolution"(%815, %523) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x1024xf32>, tensor<1x1x1024x256xf32>) -> tensor<1x14x14x256xf32>
     %817 = "mhlo.broadcast_in_dim"(%522) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<256xf32>) -> tensor<1x14x14x256xf32>
     %818 = mhlo.add %816, %817 : tensor<1x14x14x256xf32>
     %819 = "mhlo.batch_norm_inference"(%818, %521, %520, %519, %518) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x14x14x256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>) -> tensor<1x14x14x256xf32>
     %820 = mhlo.maximum %819, %325 : tensor<1x14x14x256xf32>
-    %821 = "mhlo.convolution"(%820, %529) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x256xf32>, tensor<3x3x256x256xf32>) -> tensor<1x14x14x256xf32>
+    %821 = "mhlo.convolution"(%820, %529) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x256xf32>, tensor<3x3x256x256xf32>) -> tensor<1x14x14x256xf32>
     %822 = "mhlo.broadcast_in_dim"(%528) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<256xf32>) -> tensor<1x14x14x256xf32>
     %823 = mhlo.add %821, %822 : tensor<1x14x14x256xf32>
     %824 = "mhlo.batch_norm_inference"(%823, %527, %526, %525, %524) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x14x14x256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>) -> tensor<1x14x14x256xf32>
     %825 = mhlo.maximum %824, %325 : tensor<1x14x14x256xf32>
-    %826 = "mhlo.convolution"(%825, %535) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x256xf32>, tensor<1x1x256x1024xf32>) -> tensor<1x14x14x1024xf32>
+    %826 = "mhlo.convolution"(%825, %535) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x256xf32>, tensor<1x1x256x1024xf32>) -> tensor<1x14x14x1024xf32>
     %827 = "mhlo.broadcast_in_dim"(%534) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<1024xf32>) -> tensor<1x14x14x1024xf32>
     %828 = mhlo.add %826, %827 : tensor<1x14x14x1024xf32>
     %829 = "mhlo.batch_norm_inference"(%828, %533, %532, %531, %530) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x14x14x1024xf32>, tensor<1024xf32>, tensor<1024xf32>, tensor<1024xf32>, tensor<1024xf32>) -> tensor<1x14x14x1024xf32>
     %830 = mhlo.add %815, %829 : tensor<1x14x14x1024xf32>
     %831 = mhlo.maximum %830, %326 : tensor<1x14x14x1024xf32>
-    %832 = "mhlo.convolution"(%831, %541) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x1024xf32>, tensor<1x1x1024x256xf32>) -> tensor<1x14x14x256xf32>
+    %832 = "mhlo.convolution"(%831, %541) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x1024xf32>, tensor<1x1x1024x256xf32>) -> tensor<1x14x14x256xf32>
     %833 = "mhlo.broadcast_in_dim"(%540) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<256xf32>) -> tensor<1x14x14x256xf32>
     %834 = mhlo.add %832, %833 : tensor<1x14x14x256xf32>
     %835 = "mhlo.batch_norm_inference"(%834, %539, %538, %537, %536) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x14x14x256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>) -> tensor<1x14x14x256xf32>
     %836 = mhlo.maximum %835, %325 : tensor<1x14x14x256xf32>
-    %837 = "mhlo.convolution"(%836, %547) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x256xf32>, tensor<3x3x256x256xf32>) -> tensor<1x14x14x256xf32>
+    %837 = "mhlo.convolution"(%836, %547) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x256xf32>, tensor<3x3x256x256xf32>) -> tensor<1x14x14x256xf32>
     %838 = "mhlo.broadcast_in_dim"(%546) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<256xf32>) -> tensor<1x14x14x256xf32>
     %839 = mhlo.add %837, %838 : tensor<1x14x14x256xf32>
     %840 = "mhlo.batch_norm_inference"(%839, %545, %544, %543, %542) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x14x14x256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>) -> tensor<1x14x14x256xf32>
     %841 = mhlo.maximum %840, %325 : tensor<1x14x14x256xf32>
-    %842 = "mhlo.convolution"(%841, %553) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x256xf32>, tensor<1x1x256x1024xf32>) -> tensor<1x14x14x1024xf32>
+    %842 = "mhlo.convolution"(%841, %553) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x256xf32>, tensor<1x1x256x1024xf32>) -> tensor<1x14x14x1024xf32>
     %843 = "mhlo.broadcast_in_dim"(%552) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<1024xf32>) -> tensor<1x14x14x1024xf32>
     %844 = mhlo.add %842, %843 : tensor<1x14x14x1024xf32>
     %845 = "mhlo.batch_norm_inference"(%844, %551, %550, %549, %548) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x14x14x1024xf32>, tensor<1024xf32>, tensor<1024xf32>, tensor<1024xf32>, tensor<1024xf32>) -> tensor<1x14x14x1024xf32>
     %846 = mhlo.add %831, %845 : tensor<1x14x14x1024xf32>
     %847 = mhlo.maximum %846, %326 : tensor<1x14x14x1024xf32>
-    %848 = "mhlo.convolution"(%847, %559) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x1024xf32>, tensor<1x1x1024x256xf32>) -> tensor<1x14x14x256xf32>
+    %848 = "mhlo.convolution"(%847, %559) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x1024xf32>, tensor<1x1x1024x256xf32>) -> tensor<1x14x14x256xf32>
     %849 = "mhlo.broadcast_in_dim"(%558) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<256xf32>) -> tensor<1x14x14x256xf32>
     %850 = mhlo.add %848, %849 : tensor<1x14x14x256xf32>
     %851 = "mhlo.batch_norm_inference"(%850, %557, %556, %555, %554) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x14x14x256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>) -> tensor<1x14x14x256xf32>
     %852 = mhlo.maximum %851, %325 : tensor<1x14x14x256xf32>
-    %853 = "mhlo.convolution"(%852, %565) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x256xf32>, tensor<3x3x256x256xf32>) -> tensor<1x14x14x256xf32>
+    %853 = "mhlo.convolution"(%852, %565) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x256xf32>, tensor<3x3x256x256xf32>) -> tensor<1x14x14x256xf32>
     %854 = "mhlo.broadcast_in_dim"(%564) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<256xf32>) -> tensor<1x14x14x256xf32>
     %855 = mhlo.add %853, %854 : tensor<1x14x14x256xf32>
     %856 = "mhlo.batch_norm_inference"(%855, %563, %562, %561, %560) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x14x14x256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>) -> tensor<1x14x14x256xf32>
     %857 = mhlo.maximum %856, %325 : tensor<1x14x14x256xf32>
-    %858 = "mhlo.convolution"(%857, %571) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x256xf32>, tensor<1x1x256x1024xf32>) -> tensor<1x14x14x1024xf32>
+    %858 = "mhlo.convolution"(%857, %571) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x256xf32>, tensor<1x1x256x1024xf32>) -> tensor<1x14x14x1024xf32>
     %859 = "mhlo.broadcast_in_dim"(%570) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<1024xf32>) -> tensor<1x14x14x1024xf32>
     %860 = mhlo.add %858, %859 : tensor<1x14x14x1024xf32>
     %861 = "mhlo.batch_norm_inference"(%860, %569, %568, %567, %566) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x14x14x1024xf32>, tensor<1024xf32>, tensor<1024xf32>, tensor<1024xf32>, tensor<1024xf32>) -> tensor<1x14x14x1024xf32>
     %862 = mhlo.add %847, %861 : tensor<1x14x14x1024xf32>
     %863 = mhlo.maximum %862, %326 : tensor<1x14x14x1024xf32>
-    %864 = "mhlo.convolution"(%863, %577) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x1024xf32>, tensor<1x1x1024x256xf32>) -> tensor<1x14x14x256xf32>
+    %864 = "mhlo.convolution"(%863, %577) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x1024xf32>, tensor<1x1x1024x256xf32>) -> tensor<1x14x14x256xf32>
     %865 = "mhlo.broadcast_in_dim"(%576) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<256xf32>) -> tensor<1x14x14x256xf32>
     %866 = mhlo.add %864, %865 : tensor<1x14x14x256xf32>
     %867 = "mhlo.batch_norm_inference"(%866, %575, %574, %573, %572) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x14x14x256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>) -> tensor<1x14x14x256xf32>
     %868 = mhlo.maximum %867, %325 : tensor<1x14x14x256xf32>
-    %869 = "mhlo.convolution"(%868, %583) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x256xf32>, tensor<3x3x256x256xf32>) -> tensor<1x14x14x256xf32>
+    %869 = "mhlo.convolution"(%868, %583) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x256xf32>, tensor<3x3x256x256xf32>) -> tensor<1x14x14x256xf32>
     %870 = "mhlo.broadcast_in_dim"(%582) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<256xf32>) -> tensor<1x14x14x256xf32>
     %871 = mhlo.add %869, %870 : tensor<1x14x14x256xf32>
     %872 = "mhlo.batch_norm_inference"(%871, %581, %580, %579, %578) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x14x14x256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>) -> tensor<1x14x14x256xf32>
     %873 = mhlo.maximum %872, %325 : tensor<1x14x14x256xf32>
-    %874 = "mhlo.convolution"(%873, %589) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x256xf32>, tensor<1x1x256x1024xf32>) -> tensor<1x14x14x1024xf32>
+    %874 = "mhlo.convolution"(%873, %589) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x14x14x256xf32>, tensor<1x1x256x1024xf32>) -> tensor<1x14x14x1024xf32>
     %875 = "mhlo.broadcast_in_dim"(%588) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<1024xf32>) -> tensor<1x14x14x1024xf32>
     %876 = mhlo.add %874, %875 : tensor<1x14x14x1024xf32>
     %877 = "mhlo.batch_norm_inference"(%876, %587, %586, %585, %584) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x14x14x1024xf32>, tensor<1024xf32>, tensor<1024xf32>, tensor<1024xf32>, tensor<1024xf32>) -> tensor<1x14x14x1024xf32>
     %878 = mhlo.add %863, %877 : tensor<1x14x14x1024xf32>
     %879 = mhlo.maximum %878, %326 : tensor<1x14x14x1024xf32>
-    %880 = "mhlo.convolution"(%879, %595) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : (tensor<1x14x14x1024xf32>, tensor<1x1x1024x2048xf32>) -> tensor<1x7x7x2048xf32>
+    %880 = "mhlo.convolution"(%879, %595) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : (tensor<1x14x14x1024xf32>, tensor<1x1x1024x2048xf32>) -> tensor<1x7x7x2048xf32>
     %881 = "mhlo.broadcast_in_dim"(%594) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<2048xf32>) -> tensor<1x7x7x2048xf32>
     %882 = mhlo.add %880, %881 : tensor<1x7x7x2048xf32>
     %883 = "mhlo.batch_norm_inference"(%882, %593, %592, %591, %590) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x7x7x2048xf32>, tensor<2048xf32>, tensor<2048xf32>, tensor<2048xf32>, tensor<2048xf32>) -> tensor<1x7x7x2048xf32>
-    %884 = "mhlo.convolution"(%879, %601) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : (tensor<1x14x14x1024xf32>, tensor<1x1x1024x512xf32>) -> tensor<1x7x7x512xf32>
+    %884 = "mhlo.convolution"(%879, %601) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : (tensor<1x14x14x1024xf32>, tensor<1x1x1024x512xf32>) -> tensor<1x7x7x512xf32>
     %885 = "mhlo.broadcast_in_dim"(%600) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<512xf32>) -> tensor<1x7x7x512xf32>
     %886 = mhlo.add %884, %885 : tensor<1x7x7x512xf32>
     %887 = "mhlo.batch_norm_inference"(%886, %599, %598, %597, %596) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x7x7x512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>) -> tensor<1x7x7x512xf32>
     %888 = mhlo.maximum %887, %327 : tensor<1x7x7x512xf32>
-    %889 = "mhlo.convolution"(%888, %607) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x512xf32>, tensor<3x3x512x512xf32>) -> tensor<1x7x7x512xf32>
+    %889 = "mhlo.convolution"(%888, %607) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x512xf32>, tensor<3x3x512x512xf32>) -> tensor<1x7x7x512xf32>
     %890 = "mhlo.broadcast_in_dim"(%606) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<512xf32>) -> tensor<1x7x7x512xf32>
     %891 = mhlo.add %889, %890 : tensor<1x7x7x512xf32>
     %892 = "mhlo.batch_norm_inference"(%891, %605, %604, %603, %602) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x7x7x512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>) -> tensor<1x7x7x512xf32>
     %893 = mhlo.maximum %892, %327 : tensor<1x7x7x512xf32>
-    %894 = "mhlo.convolution"(%893, %613) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x512xf32>, tensor<1x1x512x2048xf32>) -> tensor<1x7x7x2048xf32>
+    %894 = "mhlo.convolution"(%893, %613) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x512xf32>, tensor<1x1x512x2048xf32>) -> tensor<1x7x7x2048xf32>
     %895 = "mhlo.broadcast_in_dim"(%612) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<2048xf32>) -> tensor<1x7x7x2048xf32>
     %896 = mhlo.add %894, %895 : tensor<1x7x7x2048xf32>
     %897 = "mhlo.batch_norm_inference"(%896, %611, %610, %609, %608) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x7x7x2048xf32>, tensor<2048xf32>, tensor<2048xf32>, tensor<2048xf32>, tensor<2048xf32>) -> tensor<1x7x7x2048xf32>
     %898 = mhlo.add %883, %897 : tensor<1x7x7x2048xf32>
     %899 = mhlo.maximum %898, %328 : tensor<1x7x7x2048xf32>
-    %900 = "mhlo.convolution"(%899, %619) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x2048xf32>, tensor<1x1x2048x512xf32>) -> tensor<1x7x7x512xf32>
+    %900 = "mhlo.convolution"(%899, %619) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x2048xf32>, tensor<1x1x2048x512xf32>) -> tensor<1x7x7x512xf32>
     %901 = "mhlo.broadcast_in_dim"(%618) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<512xf32>) -> tensor<1x7x7x512xf32>
     %902 = mhlo.add %900, %901 : tensor<1x7x7x512xf32>
     %903 = "mhlo.batch_norm_inference"(%902, %617, %616, %615, %614) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x7x7x512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>) -> tensor<1x7x7x512xf32>
     %904 = mhlo.maximum %903, %327 : tensor<1x7x7x512xf32>
-    %905 = "mhlo.convolution"(%904, %625) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x512xf32>, tensor<3x3x512x512xf32>) -> tensor<1x7x7x512xf32>
+    %905 = "mhlo.convolution"(%904, %625) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x512xf32>, tensor<3x3x512x512xf32>) -> tensor<1x7x7x512xf32>
     %906 = "mhlo.broadcast_in_dim"(%624) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<512xf32>) -> tensor<1x7x7x512xf32>
     %907 = mhlo.add %905, %906 : tensor<1x7x7x512xf32>
     %908 = "mhlo.batch_norm_inference"(%907, %623, %622, %621, %620) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x7x7x512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>) -> tensor<1x7x7x512xf32>
     %909 = mhlo.maximum %908, %327 : tensor<1x7x7x512xf32>
-    %910 = "mhlo.convolution"(%909, %631) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x512xf32>, tensor<1x1x512x2048xf32>) -> tensor<1x7x7x2048xf32>
+    %910 = "mhlo.convolution"(%909, %631) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x512xf32>, tensor<1x1x512x2048xf32>) -> tensor<1x7x7x2048xf32>
     %911 = "mhlo.broadcast_in_dim"(%630) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<2048xf32>) -> tensor<1x7x7x2048xf32>
     %912 = mhlo.add %910, %911 : tensor<1x7x7x2048xf32>
     %913 = "mhlo.batch_norm_inference"(%912, %629, %628, %627, %626) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x7x7x2048xf32>, tensor<2048xf32>, tensor<2048xf32>, tensor<2048xf32>, tensor<2048xf32>) -> tensor<1x7x7x2048xf32>
     %914 = mhlo.add %899, %913 : tensor<1x7x7x2048xf32>
     %915 = mhlo.maximum %914, %328 : tensor<1x7x7x2048xf32>
-    %916 = "mhlo.convolution"(%915, %637) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x2048xf32>, tensor<1x1x2048x512xf32>) -> tensor<1x7x7x512xf32>
+    %916 = "mhlo.convolution"(%915, %637) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x2048xf32>, tensor<1x1x2048x512xf32>) -> tensor<1x7x7x512xf32>
     %917 = "mhlo.broadcast_in_dim"(%636) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<512xf32>) -> tensor<1x7x7x512xf32>
     %918 = mhlo.add %916, %917 : tensor<1x7x7x512xf32>
     %919 = "mhlo.batch_norm_inference"(%918, %635, %634, %633, %632) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x7x7x512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>) -> tensor<1x7x7x512xf32>
     %920 = mhlo.maximum %919, %327 : tensor<1x7x7x512xf32>
-    %921 = "mhlo.convolution"(%920, %643) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x512xf32>, tensor<3x3x512x512xf32>) -> tensor<1x7x7x512xf32>
+    %921 = "mhlo.convolution"(%920, %643) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x512xf32>, tensor<3x3x512x512xf32>) -> tensor<1x7x7x512xf32>
     %922 = "mhlo.broadcast_in_dim"(%642) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<512xf32>) -> tensor<1x7x7x512xf32>
     %923 = mhlo.add %921, %922 : tensor<1x7x7x512xf32>
     %924 = "mhlo.batch_norm_inference"(%923, %641, %640, %639, %638) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x7x7x512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>) -> tensor<1x7x7x512xf32>
     %925 = mhlo.maximum %924, %327 : tensor<1x7x7x512xf32>
-    %926 = "mhlo.convolution"(%925, %649) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x512xf32>, tensor<1x1x512x2048xf32>) -> tensor<1x7x7x2048xf32>
+    %926 = "mhlo.convolution"(%925, %649) {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<raw input_batch_dimension = 0, input_feature_dimension = 3, input_spatial_dimensions = [1, 2], kernel_input_feature_dimension = 2, kernel_output_feature_dimension = 3, kernel_spatial_dimensions = [0, 1], output_batch_dimension = 0, output_feature_dimension = 3, output_spatial_dimensions = [1, 2]>, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x7x7x512xf32>, tensor<1x1x512x2048xf32>) -> tensor<1x7x7x2048xf32>
     %927 = "mhlo.broadcast_in_dim"(%648) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<2048xf32>) -> tensor<1x7x7x2048xf32>
     %928 = mhlo.add %926, %927 : tensor<1x7x7x2048xf32>
     %929 = "mhlo.batch_norm_inference"(%928, %647, %646, %645, %644) {epsilon = 1.001000e-05 : f32, feature_index = 3 : i64} : (tensor<1x7x7x2048xf32>, tensor<2048xf32>, tensor<2048xf32>, tensor<2048xf32>, tensor<2048xf32>) -> tensor<1x7x7x2048xf32>
diff --git a/iree/test/e2e/vulkan_specific/conv.mlir b/iree/test/e2e/vulkan_specific/conv.mlir
index 13f562f..8ae0422 100644
--- a/iree/test/e2e/vulkan_specific/conv.mlir
+++ b/iree/test/e2e/vulkan_specific/conv.mlir
@@ -49,16 +49,17 @@
    : tensor<2x3x2x3xf32>
   %2 = "mhlo.convolution"(%0, %1) {
        batch_group_count = 1 : i64,
-       dimension_numbers = {
-         input_batch_dimension = 0 : i64,
-   input_feature_dimension = 3 : i64,
-   input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>,
-   kernel_input_feature_dimension = 2 : i64,
-   kernel_output_feature_dimension = 3 : i64,
-   kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>,
-   output_batch_dimension = 0 : i64,
-   output_feature_dimension = 3 : i64,
-   output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>},
+       dimension_numbers = #mhlo.conv<raw
+          input_batch_dimension = 0,
+          input_feature_dimension = 3,
+          input_spatial_dimensions = [1, 2],
+          kernel_input_feature_dimension = 2,
+          kernel_output_feature_dimension = 3,
+          kernel_spatial_dimensions = [0, 1],
+          output_batch_dimension = 0,
+          output_feature_dimension = 3,
+          output_spatial_dimensions = [1, 2]
+        >,
        feature_group_count = 1 : i64,
        rhs_dilation = dense<1> : tensor<2xi64>,
        window_strides = dense<1> : tensor<2xi64>}
diff --git a/iree/test/e2e/vulkan_specific/vectorized_conv.mlir b/iree/test/e2e/vulkan_specific/vectorized_conv.mlir
index ab6a290..b0dfabf 100644
--- a/iree/test/e2e/vulkan_specific/vectorized_conv.mlir
+++ b/iree/test/e2e/vulkan_specific/vectorized_conv.mlir
@@ -45,7 +45,18 @@
           1.0, 1.0, 2.5, 3.0, 2.0, 1.0, 1.0, 0.5, 0.0, 4.5, 0.0, 1.0, 4.0, 1.5, 5.0, 0.0]]]]>
     : tensor<2x2x4x32xf32>
 
-    %0 = "mhlo.convolution"(%input, %filter) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x3x3x4xf32>, tensor<2x2x4x32xf32>) -> tensor<1x2x2x32xf32>
+    %0 = "mhlo.convolution"(%input, %filter) {batch_group_count = 1 : i64,
+      dimension_numbers = #mhlo.conv<raw
+      input_batch_dimension = 0,
+      input_feature_dimension = 3,
+      input_spatial_dimensions = [1, 2],
+      kernel_input_feature_dimension = 2,
+      kernel_output_feature_dimension = 3,
+      kernel_spatial_dimensions = [0, 1],
+      output_batch_dimension = 0,
+      output_feature_dimension = 3,
+      output_spatial_dimensions = [1, 2]
+    >, feature_group_count = 1 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x3x3x4xf32>, tensor<2x2x4x32xf32>) -> tensor<1x2x2x32xf32>
 
    check.expect_almost_eq_const(%0, dense<
      [[[[113.25, 127.0, 198.0, 173.25, 159.5, 190.75, 135.5, 160.0,
@@ -79,7 +90,18 @@
     [[[[2.0, 2.0, 4.0, 2.0, 1.5, 5.0, 3.5, 2.5, 2.5, 0.0, 0.5, 2.5, 4.5, 1.5, 0.0, 2.5]]]]>
     : tensor<1x1x1x16xf32>
 
-    %0 = "mhlo.convolution"(%input, %filter) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, feature_group_count = 16 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x4x16xf32>, tensor<1x1x1x16xf32>) -> tensor<1x1x4x16xf32>
+    %0 = "mhlo.convolution"(%input, %filter) {batch_group_count = 1 : i64,
+      dimension_numbers = #mhlo.conv<raw
+        input_batch_dimension = 0,
+        input_feature_dimension = 3,
+        input_spatial_dimensions = [1, 2],
+        kernel_input_feature_dimension = 2,
+        kernel_output_feature_dimension = 3,
+        kernel_spatial_dimensions = [0, 1],
+        output_batch_dimension = 0,
+        output_feature_dimension = 3,
+        output_spatial_dimensions = [1, 2]
+      >, feature_group_count = 16 : i64, padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : (tensor<1x1x4x16xf32>, tensor<1x1x1x16xf32>) -> tensor<1x1x4x16xf32>
 
    check.expect_almost_eq_const(%0, dense<
      [[[[12.0, 15.0, 0.0, 3.0, 2.25, 17.5, 15.75, 5.0, 7.5, 0.0, 0.25, 7.5, 15.75, 10.5, 0.0, 16.25],
diff --git a/iree/test/e2e/xla_ops/convolution.mlir b/iree/test/e2e/xla_ops/convolution.mlir
index 8472efd..78f26f6 100644
--- a/iree/test/e2e/xla_ops/convolution.mlir
+++ b/iree/test/e2e/xla_ops/convolution.mlir
@@ -10,16 +10,17 @@
       [[[ 9.0], [10.0]], [[11.0], [12.0]]]]> : tensor<3x2x2x1xf32>
   %res = "mhlo.convolution"(%inputs, %weights) {
         batch_group_count = 1 : i64,
-        dimension_numbers = {
-          input_batch_dimension = 0 : i64,
-          input_feature_dimension = 3 : i64,
-          input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>,
-          kernel_input_feature_dimension = 2 : i64,
-          kernel_output_feature_dimension = 3 : i64,
-          kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>,
-          output_batch_dimension = 0 : i64,
-          output_feature_dimension = 3 : i64,
-          output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>},
+        dimension_numbers = #mhlo.conv<raw
+          input_batch_dimension = 0,
+          input_feature_dimension = 3,
+          input_spatial_dimensions = [1, 2],
+          kernel_input_feature_dimension = 2,
+          kernel_output_feature_dimension = 3,
+          kernel_spatial_dimensions = [0, 1],
+          output_batch_dimension = 0,
+          output_feature_dimension = 3,
+          output_spatial_dimensions = [1, 2]
+        >,
         feature_group_count = 1 : i64,
         rhs_dilation = dense<1> : tensor<2xi64>,
         window_strides = dense<1> : tensor<2xi64>} : (tensor<1x4x4x2xf32>, tensor<3x2x2x1xf32>) -> tensor<1x2x3x1xf32>
@@ -47,16 +48,17 @@
       [[[ 9.0], [10.0]], [[11.0], [12.0]]]]> : tensor<3x2x2x1xf32>
   %res = "mhlo.convolution"(%inputs, %weights) {
         batch_group_count = 1 : i64,
-        dimension_numbers = {
-          input_batch_dimension = 3 : i64,
-          input_feature_dimension = 0 : i64,
-          input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>,
-          kernel_input_feature_dimension = 2 : i64,
-          kernel_output_feature_dimension = 3 : i64,
-          kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>,
-          output_batch_dimension = 0 : i64,
-          output_feature_dimension = 3 : i64,
-          output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>},
+        dimension_numbers = #mhlo.conv<raw
+          input_batch_dimension = 3,
+          input_feature_dimension = 0,
+          input_spatial_dimensions = [1, 2],
+          kernel_input_feature_dimension = 2,
+          kernel_output_feature_dimension = 3,
+          kernel_spatial_dimensions = [0, 1],
+          output_batch_dimension = 0,
+          output_feature_dimension = 3,
+          output_spatial_dimensions = [1, 2]
+        >,
         feature_group_count = 1 : i64,
         rhs_dilation = dense<1> : tensor<2xi64>,
         window_strides = dense<1> : tensor<2xi64>} : (tensor<2x4x4x1xf32>, tensor<3x2x2x1xf32>) -> tensor<1x2x3x1xf32>
@@ -79,16 +81,17 @@
       [[[ 9.0], [10.0]], [[11.0], [12.0]]]]> : tensor<3x2x2x1xf32>
   %res = "mhlo.convolution"(%inputs, %weights) {
         batch_group_count = 1 : i64,
-        dimension_numbers = {
-          input_batch_dimension = 0 : i64,
-          input_feature_dimension = 3 : i64,
-          input_spatial_dimensions = dense<[2, 1]> : tensor<2xi64>,
-          kernel_input_feature_dimension = 2 : i64,
-          kernel_output_feature_dimension = 3 : i64,
-          kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>,
-          output_batch_dimension = 0 : i64,
-          output_feature_dimension = 3 : i64,
-          output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>},
+        dimension_numbers = #mhlo.conv<raw
+          input_batch_dimension = 0,
+          input_feature_dimension = 3,
+          input_spatial_dimensions = [2, 1],
+          kernel_input_feature_dimension = 2,
+          kernel_output_feature_dimension = 3,
+          kernel_spatial_dimensions = [0, 1],
+          output_batch_dimension = 0,
+          output_feature_dimension = 3,
+          output_spatial_dimensions = [1, 2]
+        >,
         feature_group_count = 1 : i64,
         rhs_dilation = dense<1> : tensor<2xi64>,
         window_strides = dense<1> : tensor<2xi64>} : (tensor<1x4x4x2xf32>, tensor<3x2x2x1xf32>) -> tensor<1x2x3x1xf32>
@@ -111,16 +114,17 @@
         [[ 9.0, 11.0], [10.0, 12.0]]]]> : tensor<1x3x2x2xf32>
   %res = "mhlo.convolution"(%inputs, %weights) {
         batch_group_count = 1 : i64,
-        dimension_numbers = {
-          input_batch_dimension = 0 : i64,
-          input_feature_dimension = 3 : i64,
-          input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>,
-          kernel_input_feature_dimension = 2 : i64,
-          kernel_output_feature_dimension = 0 : i64,
-          kernel_spatial_dimensions = dense<[1, 3]> : tensor<2xi64>,
-          output_batch_dimension = 0 : i64,
-          output_feature_dimension = 3 : i64,
-          output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>},
+        dimension_numbers = #mhlo.conv<raw
+          input_batch_dimension = 0,
+          input_feature_dimension = 3,
+          input_spatial_dimensions = [1, 2],
+          kernel_input_feature_dimension = 2,
+          kernel_output_feature_dimension = 0,
+          kernel_spatial_dimensions = [1, 3],
+          output_batch_dimension = 0,
+          output_feature_dimension = 3,
+          output_spatial_dimensions = [1, 2]
+        >,
         feature_group_count = 1 : i64,
         rhs_dilation = dense<1> : tensor<2xi64>,
         window_strides = dense<1> : tensor<2xi64>} : (tensor<1x4x4x2xf32>, tensor<1x3x2x2xf32>) -> tensor<1x2x3x1xf32>
@@ -143,16 +147,17 @@
       [[[ 9.0], [10.0]], [[11.0], [12.0]]]]> : tensor<3x2x2x1xf32>
   %res = "mhlo.convolution"(%inputs, %weights) {
         batch_group_count = 1 : i64,
-        dimension_numbers = {
-          input_batch_dimension = 0 : i64,
-          input_feature_dimension = 3 : i64,
-          input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>,
-          kernel_input_feature_dimension = 2 : i64,
-          kernel_output_feature_dimension = 3 : i64,
-          kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>,
-          output_batch_dimension = 2 : i64,
-          output_feature_dimension = 0 : i64,
-          output_spatial_dimensions = dense<[3, 1]> : tensor<2xi64>},
+        dimension_numbers = #mhlo.conv<raw
+          input_batch_dimension = 0,
+          input_feature_dimension = 3,
+          input_spatial_dimensions = [1, 2],
+          kernel_input_feature_dimension = 2,
+          kernel_output_feature_dimension = 3,
+          kernel_spatial_dimensions = [0, 1],
+          output_batch_dimension = 2,
+          output_feature_dimension = 0,
+          output_spatial_dimensions = [3, 1]
+        >,
         feature_group_count = 1 : i64,
         rhs_dilation = dense<1> : tensor<2xi64>,
         window_strides = dense<1> : tensor<2xi64>} : (tensor<1x4x4x2xf32>, tensor<3x2x2x1xf32>) -> tensor<1x3x1x2xf32>
@@ -176,16 +181,17 @@
       [[[ 9.0], [10.0]], [[11.0], [12.0]]]]> : tensor<3x2x2x1xf32>
   %res = "mhlo.convolution"(%inputs, %weights) {
        batch_group_count = 1 : i64,
-       dimension_numbers = {
-         input_batch_dimension = 0 : i64,
-         input_feature_dimension = 3 : i64,
-         input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>,
-         kernel_input_feature_dimension = 2 : i64,
-         kernel_output_feature_dimension = 3 : i64,
-         kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>,
-         output_batch_dimension = 0 : i64,
-         output_feature_dimension = 3 : i64,
-         output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>},
+       dimension_numbers = #mhlo.conv<raw
+          input_batch_dimension = 0,
+          input_feature_dimension = 3,
+          input_spatial_dimensions = [1, 2],
+          kernel_input_feature_dimension = 2,
+          kernel_output_feature_dimension = 3,
+          kernel_spatial_dimensions = [0, 1],
+          output_batch_dimension = 0,
+          output_feature_dimension = 3,
+          output_spatial_dimensions = [1, 2]
+        >,
        feature_group_count = 1 : i64,
        padding = dense<[[1, 1], [0, 1]]> : tensor<2x2xi64>,
        rhs_dilation = dense<1> : tensor<2xi64>,
@@ -214,16 +220,17 @@
       [[[4.0]], [[5.0]], [[6.0]]]]> : tensor <2x3x1x1xf32>
   %res = "mhlo.convolution"(%inputs, %weights) {
        batch_group_count = 1 : i64,
-       dimension_numbers = {
-         input_batch_dimension = 0 : i64,
-         input_feature_dimension = 3 : i64,
-         input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>,
-         kernel_input_feature_dimension = 2 : i64,
-         kernel_output_feature_dimension = 3 : i64,
-         kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>,
-         output_batch_dimension = 0 : i64,
-         output_feature_dimension = 3 : i64,
-         output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>},
+       dimension_numbers = #mhlo.conv<raw
+          input_batch_dimension = 0,
+          input_feature_dimension = 3,
+          input_spatial_dimensions = [1, 2],
+          kernel_input_feature_dimension = 2,
+          kernel_output_feature_dimension = 3,
+          kernel_spatial_dimensions = [0, 1],
+          output_batch_dimension = 0,
+          output_feature_dimension = 3,
+          output_spatial_dimensions = [1, 2]
+        >,
        feature_group_count = 1 : i64,
        padding = dense<[[0, 1], [1, 1]]> : tensor<2x2xi64>,
        rhs_dilation = dense<1> : tensor<2xi64>,
@@ -304,16 +311,17 @@
         [103.0, 104.0, 105.0, 106.0, 107.0, 108.0]]]]> : tensor<2x3x3x6xf32>
   %res = "mhlo.convolution"(%inputs, %weights) {
        batch_group_count = 1 : i64,
-       dimension_numbers = {
-         input_batch_dimension = 0 : i64,
-         input_feature_dimension = 3 : i64,
-         input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>,
-         kernel_input_feature_dimension = 2 : i64,
-         kernel_output_feature_dimension = 3 : i64,
-         kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>,
-         output_batch_dimension = 0 : i64,
-         output_feature_dimension = 3 : i64,
-         output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>},
+       dimension_numbers = #mhlo.conv<raw
+          input_batch_dimension = 0,
+          input_feature_dimension = 3,
+          input_spatial_dimensions = [1, 2],
+          kernel_input_feature_dimension = 2,
+          kernel_output_feature_dimension = 3,
+          kernel_spatial_dimensions = [0, 1],
+          output_batch_dimension = 0,
+          output_feature_dimension = 3,
+          output_spatial_dimensions = [1, 2]
+        >,
        feature_group_count = 1 : i64,
        rhs_dilation = dense<1> : tensor<2xi64>,
        window_strides = dense<1> : tensor<2xi64>} :
@@ -373,17 +381,17 @@
        [-0.7792497,   0.31265917, -0.7236341 ]]]]> : tensor<2x2x2x3xf32>
   %res = "mhlo.convolution"(%inputs, %weights) {
     batch_group_count = 1 : i64,
-    dimension_numbers = {
-      input_batch_dimension = 0 : i64,
-      input_feature_dimension = 3 : i64,
-      input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>,
-      kernel_input_feature_dimension = 2 : i64,
-      kernel_output_feature_dimension = 3 : i64,
-      kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>,
-      output_batch_dimension = 0 : i64,
-      output_feature_dimension = 3 : i64,
-      output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>
-    },
+    dimension_numbers = #mhlo.conv<raw
+          input_batch_dimension = 0,
+          input_feature_dimension = 3,
+          input_spatial_dimensions = [1, 2],
+          kernel_input_feature_dimension = 2,
+          kernel_output_feature_dimension = 3,
+          kernel_spatial_dimensions = [0, 1],
+          output_batch_dimension = 0,
+          output_feature_dimension = 3,
+          output_spatial_dimensions = [1, 2]
+        >,
     feature_group_count = 1 : i64,
     padding = dense<0> : tensor<2x2xi64>,
     rhs_dilation = dense<[2, 1]> : tensor<2xi64>,
@@ -406,17 +414,17 @@
   %arg1 = util.unfoldable_constant dense<1.0> : tensor<2x2x2x3xf32>
   %res = "mhlo.convolution"(%arg0, %arg1) {
     batch_group_count = 1 : i64,
-    dimension_numbers = {
-      input_batch_dimension = 0 : i64,
-      input_feature_dimension = 3 : i64,
-      input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>,
-      kernel_input_feature_dimension = 2 : i64,
-      kernel_output_feature_dimension = 3 : i64,
-      kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>,
-      output_batch_dimension = 0 : i64,
-      output_feature_dimension = 3 : i64,
-      output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>
-    },
+    dimension_numbers = #mhlo.conv<raw
+          input_batch_dimension = 0,
+          input_feature_dimension = 3,
+          input_spatial_dimensions = [1, 2],
+          kernel_input_feature_dimension = 2,
+          kernel_output_feature_dimension = 3,
+          kernel_spatial_dimensions = [0, 1],
+          output_batch_dimension = 0,
+          output_feature_dimension = 3,
+          output_spatial_dimensions = [1, 2]
+        >,
     feature_group_count = 2 : i64,
     padding = dense<0> : tensor<2x2xi64>,
     rhs_dilation = dense<1> : tensor<2xi64>,
diff --git a/llvm-external-projects/iree-dialects/BUILD b/llvm-external-projects/iree-dialects/BUILD
index ff9cbdf..90340be 100644
--- a/llvm-external-projects/iree-dialects/BUILD
+++ b/llvm-external-projects/iree-dialects/BUILD
@@ -275,6 +275,7 @@
 cc_library(
     name = "IREEPyDMTransforms",
     srcs = glob([
+        "lib/Dialect/IREEPyDM/Transforms/*.cpp",
         "lib/Dialect/IREEPyDM/Transforms/RTL/*.cpp",
         "lib/Dialect/IREEPyDM/Transforms/ToIREE/*.cpp",
     ]),
@@ -291,6 +292,7 @@
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:MathDialect",
         "@llvm-project//mlir:Parser",
+        "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:StandardOps",
         "@llvm-project//mlir:Support",
         "@llvm-project//mlir:TransformUtils",
diff --git a/third_party/llvm-project b/third_party/llvm-project
index 5f7a535..471b25e 160000
--- a/third_party/llvm-project
+++ b/third_party/llvm-project
@@ -1 +1 @@
-Subproject commit 5f7a5353301b776ffb0e5fb048992898507bf7ee
+Subproject commit 471b25e217e635e058bbdbca8c693e2998380a60