Merge google -> main #7136

2e8546d Synchronize submodules with LLVM at llvm/llvm-project@f5b8f12
29317b6 Merge pull request #7131 from rsuderman:main-to-google (Merge main -> google)
a317e2c Move quantized type stripping to a separate pass.
87e6ba5 Migrate ScatterDimensionNumbers attribute definition from StructAttr to be a..
1d77498 Synchronize submodules with LLVM at llvm/llvm-project@f5b8f12
diff --git a/iree/compiler/Codegen/LLVMCPU/BUILD b/iree/compiler/Codegen/LLVMCPU/BUILD
index a71024b..f9e7750 100644
--- a/iree/compiler/Codegen/LLVMCPU/BUILD
+++ b/iree/compiler/Codegen/LLVMCPU/BUILD
@@ -16,7 +16,6 @@
         "ConvertToLLVM.cpp",
         "KernelDispatch.cpp",
         "LLVMCPULowerExecutableTarget.cpp",
-        "LLVMCPUPadWorkgroupTiles.cpp",
         "LLVMCPUPlanConvLoopOrder.cpp",
         "LLVMCPUSynchronizeSymbolVisibility.cpp",
         "LLVMCPUTileAndVectorizeLinalgTensorOps.cpp",
diff --git a/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt b/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
index fb5643a..72597b1 100644
--- a/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
+++ b/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
@@ -19,7 +19,6 @@
     "ConvertToLLVM.cpp"
     "KernelDispatch.cpp"
     "LLVMCPULowerExecutableTarget.cpp"
-    "LLVMCPUPadWorkgroupTiles.cpp"
     "LLVMCPUPlanConvLoopOrder.cpp"
     "LLVMCPUSynchronizeSymbolVisibility.cpp"
     "LLVMCPUTileAndVectorizeLinalgTensorOps.cpp"
diff --git a/iree/compiler/Codegen/LLVMCPU/LLVMCPUPadWorkgroupTiles.cpp b/iree/compiler/Codegen/LLVMCPU/LLVMCPUPadWorkgroupTiles.cpp
deleted file mode 100644
index 3f11b5e..0000000
--- a/iree/compiler/Codegen/LLVMCPU/LLVMCPUPadWorkgroupTiles.cpp
+++ /dev/null
@@ -1,246 +0,0 @@
-// Copyright 2021 The IREE Authors
-//
-// Licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#include "iree/compiler/Codegen/LLVMCPU/KernelDispatch.h"
-#include "iree/compiler/Codegen/PassDetail.h"
-#include "iree/compiler/Codegen/Passes.h"
-#include "iree/compiler/Codegen/Utils/Utils.h"
-#include "iree/compiler/Dialect/Flow/IR/FlowOps.h"
-#include "llvm/Support/Debug.h"
-#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
-#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
-#include "mlir/IR/Builders.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-
-#define DEBUG_TYPE "iree-llvmcpu-pad-workgroup-tiles"
-
-namespace mlir {
-namespace iree_compiler {
-
-namespace {
-
-// Creates linalg.pad_tensor op with constant padding.
-static linalg::PadTensorOp createPadTensorOpWithStaticPadding(
-    PatternRewriter &rewriter, mlir::Location loc, Type resultType, Value input,
-    Value padding, ArrayRef<int64_t> lowPadding,
-    ArrayRef<int64_t> highPadding) {
-  auto padTensorOp = rewriter.create<linalg::PadTensorOp>(
-      loc, resultType, input, ArrayRef<Value>{}, ArrayRef<Value>{},
-      rewriter.getI64ArrayAttr(lowPadding),
-      rewriter.getI64ArrayAttr(highPadding));
-
-  int rank = padTensorOp.getResultType().getRank();
-  SmallVector<Type, 4> blockArgTypes;
-  blockArgTypes.assign(rank, rewriter.getIndexType());
-  auto &region = padTensorOp.region();
-  OpBuilder::InsertionGuard guard(rewriter);
-  rewriter.createBlock(&region, region.end(), blockArgTypes);
-  rewriter.create<linalg::YieldOp>(loc, padding);
-  return padTensorOp;
-}
-
-// Returns padding for dim to next integer multiple of the vector size.
-static std::pair<int, int> getVectorPaddingSize(int dim, int vecSize) {
-  if (dim < vecSize) {
-    int size = vecSize;
-    int padValue = size - dim;
-    return {vecSize, padValue};
-  } else {
-    int size = ceil((float)dim / (float)vecSize) * vecSize;
-    int padValue = size - dim;
-    return {size, padValue};
-  }
-}
-
-// Returns padding for dim to the next integer multiple of the workgroup size.
-// Note: KernelDispatch guarantees the workgroup size is the largest integer
-// multiple of the vector size.
-std::pair<int, int> getWorkgroupPaddedTileSize(int dim, int workgroupSize,
-                                               int vecSize) {
-  if (dim > workgroupSize) {
-    int size = ceil((float)dim / (float)workgroupSize) * workgroupSize;
-    int padValue = size - dim;
-    return {workgroupSize, padValue};
-  } else {
-    return getVectorPaddingSize(dim, vecSize);
-  }
-}
-
-// Creates linalg.matmul with operands padded to the next integer multiple of
-// the workgroup size.
-class MatmulWorkgroupTilesPadding : public OpRewritePattern<linalg::MatmulOp> {
- public:
-  using OpRewritePattern<linalg::MatmulOp>::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(linalg::MatmulOp matmulOp,
-                                PatternRewriter &rewriter) const override {
-    if (!getLoweringConfig(matmulOp)) return failure();
-    auto loc = matmulOp.getLoc();
-    auto lhs = matmulOp.getInputOperand(0)->get();
-    auto rhs = matmulOp.getInputOperand(1)->get();
-    auto result = matmulOp.getOutputOperand(0)->get();
-
-    if (lhs.getDefiningOp<linalg::PadTensorOp>() ||
-        rhs.getDefiningOp<linalg::PadTensorOp>())
-      return failure();
-
-    auto workgroupTileSizes = getTileSizes(
-        matmulOp, static_cast<unsigned>(TilingLevel::WorkGroupTiles));
-    auto vectorTileSizes =
-        getTileSizes(matmulOp, static_cast<unsigned>(TilingLevel::Level2Tiles));
-    if (workgroupTileSizes.empty() || vectorTileSizes.empty()) return failure();
-
-    auto lhsFullSize = getUntiledShape(lhs);
-    auto rhsFullSize = getUntiledShape(rhs);
-    if (lhsFullSize.empty() || rhsFullSize.empty()) return failure();
-
-    int problemSizeM = lhsFullSize[0];
-    int problemSizeN = rhsFullSize[1];
-    int problemSizeK = lhsFullSize[1];
-
-    int paddedMSize, paddedNSize, paddedKSize;
-    int paddingForM, paddingForN, paddingForK;
-    std::tie(paddedMSize, paddingForM) = getWorkgroupPaddedTileSize(
-        problemSizeM, workgroupTileSizes[0], vectorTileSizes[0]);
-    std::tie(paddedNSize, paddingForN) = getWorkgroupPaddedTileSize(
-        problemSizeN, workgroupTileSizes[1], vectorTileSizes[1]);
-    std::tie(paddedKSize, paddingForK) =
-        getVectorPaddingSize(problemSizeK, vectorTileSizes[2]);
-
-    // No padding.
-    if (paddingForM == 0 && paddingForN == 0 && paddingForK == 0)
-      return failure();
-
-    DEBUG_WITH_TYPE(DEBUG_TYPE, {
-      auto l1TileSizes = getTileSizes(
-          matmulOp, static_cast<unsigned>(TilingLevel::Level1Tiles));
-      llvm::dbgs() << "Problem-size: "
-                   << "[" << problemSizeM << "," << problemSizeK << "]"
-                   << ", "
-                   << "[" << problemSizeK << "," << problemSizeN << "]\n";
-      llvm::dbgs() << "Workgroup-tile-sizes:"
-                   << "[" << workgroupTileSizes[0] << ", "
-                   << workgroupTileSizes[1] << "]\n";
-      llvm::dbgs() << "L1-tile-sizes:"
-                   << "[" << l1TileSizes[0] << ", " << l1TileSizes[1] << ","
-                   << l1TileSizes[2] << "]\n";
-      llvm::dbgs() << "Vector-tile-sizes:"
-                   << "[" << vectorTileSizes[0] << ", " << vectorTileSizes[1]
-                   << ", " << vectorTileSizes[2] << "]\n";
-      auto lhsStackSize = paddedMSize * paddedKSize * 4;
-      auto rhsStackSize = paddedKSize * paddedNSize * 4;
-      auto outputStackSize = paddedMSize * paddedNSize * 4;
-      llvm::dbgs() << "LHS after padding:"
-                   << "[" << paddedMSize << "," << paddedKSize
-                   << "], size_in_bytes = " << lhsStackSize << "\n";
-      llvm::dbgs() << "RHS after padding:"
-                   << "[" << paddedKSize << "," << paddedNSize
-                   << "], size_in_bytes = " << rhsStackSize << "\n";
-      llvm::dbgs() << "Result after padding:"
-                   << "[" << paddedMSize << "," << paddedNSize
-                   << "], size_in_bytes = " << outputStackSize << "\n";
-    });
-
-    auto getPaddedOperand = [&](Value operand, ArrayRef<int64_t> shape,
-                                ArrayRef<int64_t> highPadding) -> Value {
-      if (llvm::all_of(highPadding,
-                       [](int64_t val) -> bool { return val == 0; })) {
-        return operand;
-      }
-      auto elementType =
-          operand.getType().cast<RankedTensorType>().getElementType();
-      auto paddedType = RankedTensorType::get(shape, elementType);
-      auto paddingValue =
-          rewriter.create<ConstantOp>(loc, rewriter.getZeroAttr(elementType));
-      auto paddedOperand =
-          createPadTensorOpWithStaticPadding(rewriter, loc, paddedType, operand,
-                                             paddingValue, {0, 0}, highPadding);
-      return paddedOperand;
-    };
-
-    auto paddedLhs = getPaddedOperand(lhs, {paddedMSize, paddedKSize},
-                                      {paddingForM, paddingForK});
-
-    auto paddedRhs = getPaddedOperand(rhs, {paddedKSize, paddedNSize},
-                                      {paddingForK, paddingForN});
-
-    auto resultType = RankedTensorType::get(
-        {paddedMSize, paddedNSize},
-        result.getType().cast<RankedTensorType>().getElementType());
-
-    // Padding only the K dim doesn't change the result size.
-    if (paddingForM == 0 && paddingForN == 0) {
-      auto paddedMatmulOp =
-          cast<linalg::LinalgOp>(matmulOp.getOperation())
-              .clone(rewriter, loc, {resultType},
-                     ArrayRef<Value>{paddedLhs, paddedRhs, result});
-      rewriter.replaceOp(matmulOp, paddedMatmulOp->getResults());
-    } else {
-      // Padding either M or N requires changing the result size.
-      auto getActualSizes = [](Value operand) {
-        auto defOp = operand.getDefiningOp<IREE::Flow::DispatchTensorLoadOp>();
-        return defOp.sizes();
-      };
-      // Get the actual output tile size (before padding).
-      auto lhsSizes = getActualSizes(lhs);
-      auto rhsSizes = getActualSizes(rhs);
-      SmallVector<OpFoldResult> sizes;
-      if (lhsSizes.empty()) {
-        sizes.push_back(rewriter.getIndexAttr(paddedMSize));
-      } else {
-        sizes.push_back(lhsSizes.front());
-      }
-      if (rhsSizes.empty()) {
-        sizes.push_back(rewriter.getIndexAttr(paddedNSize));
-      } else {
-        sizes.push_back(rhsSizes.back());
-      }
-      auto elementType = matmulOp.getResults()[0]
-                             .getType()
-                             .cast<ShapedType>()
-                             .getElementType();
-      auto staticResult = rewriter.create<linalg::InitTensorOp>(
-          loc, ArrayRef<int64_t>{paddedMSize, paddedNSize}, elementType);
-
-      Value zero =
-          rewriter.create<ConstantOp>(loc, rewriter.getZeroAttr(elementType));
-      auto filledStaticResult =
-          rewriter.create<linalg::FillOp>(loc, zero, staticResult);
-      auto paddedMatmulOp =
-          cast<linalg::LinalgOp>(matmulOp.getOperation())
-              .clone(rewriter, loc, {resultType},
-                     ArrayRef<Value>{paddedLhs, paddedRhs,
-                                     filledStaticResult.result()});
-      SmallVector<OpFoldResult> offsets(2, rewriter.getI64IntegerAttr(0));
-      SmallVector<OpFoldResult> strides(2, rewriter.getI64IntegerAttr(1));
-      rewriter.replaceOpWithNewOp<tensor::ExtractSliceOp>(
-          matmulOp, paddedMatmulOp->getResults()[0], offsets, sizes, strides);
-    }
-    return success();
-  }
-};
-
-struct LLVMCPUPadWorkgroupTilesPass
-    : LLVMCPUPadWorkgroupTilesBase<LLVMCPUPadWorkgroupTilesPass> {
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<linalg::LinalgDialect>();
-  }
-  void runOnOperation() override {
-    MLIRContext *context = &getContext();
-    OwningRewritePatternList patterns(&getContext());
-    patterns.insert<MatmulWorkgroupTilesPadding>(context);
-    (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
-  }
-};
-}  // namespace
-
-std::unique_ptr<OperationPass<FuncOp>> createLLVMCPUPadWorkgroupTilesPass() {
-  return std::make_unique<LLVMCPUPadWorkgroupTilesPass>();
-}
-
-}  // namespace iree_compiler
-}  // namespace mlir
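
The padding arithmetic this deleted pass implemented is small enough to model standalone. Below is a plain C++ sketch of its two size helpers (function names kept from the deleted file; main and the sample sizes are illustrative only, not IREE code). It reproduces the numbers in the deleted pad_workgroup_tiles.mlir test further down: with a 64-wide workgroup tile and vector size 4, M = 5 pads to 8, K = 3 pads to 4, and N = 5 pads to 8, matching the high[3, 1] / high[1, 3] padding and the 8x4 * 4x8 -> 8x8 shapes checked there.

// Standalone model of the deleted pass's padding math; not IREE code.
#include <cstdio>
#include <utility>

// Round dim up to the next multiple of vecSize; returns {paddedSize, pad}.
static std::pair<int, int> getVectorPaddingSize(int dim, int vecSize) {
  int size = ((dim + vecSize - 1) / vecSize) * vecSize;  // integer ceil
  return {size, size - dim};
}

// If the problem dim exceeds the workgroup tile, round it up to the next
// multiple of the workgroup tile; otherwise fall back to vector padding.
static std::pair<int, int> getWorkgroupPaddedTileSize(int dim,
                                                      int workgroupSize,
                                                      int vecSize) {
  if (dim > workgroupSize) {
    int size = ((dim + workgroupSize - 1) / workgroupSize) * workgroupSize;
    return {workgroupSize, size - dim};
  }
  return getVectorPaddingSize(dim, vecSize);
}

int main() {
  auto [m, padM] = getWorkgroupPaddedTileSize(5, 64, 4);  // {8, 3}
  auto [k, padK] = getVectorPaddingSize(3, 4);            // {4, 1}
  auto [n, padN] = getWorkgroupPaddedTileSize(5, 64, 4);  // {8, 3}
  std::printf("LHS %dx%d high[%d, %d], RHS %dx%d high[%d, %d]\n",
              m, k, padM, padK, k, n, padK, padN);
}
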
diff --git a/iree/compiler/Codegen/LLVMCPU/test/BUILD b/iree/compiler/Codegen/LLVMCPU/test/BUILD
index a5631f2..0327a2f 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/BUILD
+++ b/iree/compiler/Codegen/LLVMCPU/test/BUILD
@@ -24,7 +24,6 @@
             "hal_interface_workgroup_info.mlir",
             "materialize_launch_configuration.mlir",
             "matmul_vectorization.mlir",
-            "pad_workgroup_tiles.mlir",
             "plan_conv_loop_order.mlir",
             "synchronize_symbol_visibility.mlir",
             "tile_and_vectorize.mlir",
diff --git a/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt b/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
index 3c94a39..759e6d4 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
+++ b/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
@@ -19,7 +19,6 @@
     "hal_interface_workgroup_info.mlir"
     "materialize_launch_configuration.mlir"
     "matmul_vectorization.mlir"
-    "pad_workgroup_tiles.mlir"
     "plan_conv_loop_order.mlir"
     "synchronize_symbol_visibility.mlir"
     "tile_and_vectorize.mlir"
diff --git a/iree/compiler/Codegen/LLVMCPU/test/pad_workgroup_tiles.mlir b/iree/compiler/Codegen/LLVMCPU/test/pad_workgroup_tiles.mlir
deleted file mode 100644
index e601390..0000000
--- a/iree/compiler/Codegen/LLVMCPU/test/pad_workgroup_tiles.mlir
+++ /dev/null
@@ -1,66 +0,0 @@
-// RUN: iree-opt %s -cse -iree-llvmcpu-pad-workgroup-tiles -split-input-file | IreeFileCheck %s
-
-#config0 = {tileSizes = [[64, 64]]}
-#config1 = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 32], [4, 4, 4]]}
-module  {
-  func @matmul_f32_5x3x5() {
-    %c0 = constant 0 : index
-    %cst = constant 0.000000e+00 : f32
-    %c5 = constant 5 : index
-    %c64 = constant 64 : index
-    %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:5x3xf32>
-    %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:3x5xf32>
-    %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:5x5xf32>
-    %workgroup_id_x = hal.interface.workgroup.id[0] : index
-    %workgroup_count_x = hal.interface.workgroup.count[0] : index
-    %workgroup_id_y = hal.interface.workgroup.id[1] : index
-    %workgroup_count_y = hal.interface.workgroup.count[1] : index
-    %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %c64]
-    %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %c64]
-    scf.for %arg0 = %3 to %c5 step %4 {
-      %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %c64]
-      %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %c64]
-      scf.for %arg1 = %5 to %c5 step %6 {
-        %7 = affine.min affine_map<(d0) -> (64, -d0 + 5)>(%arg0)
-        %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:5x3xf32> -> tensor<?x3xf32>
-        %9 = affine.min affine_map<(d0) -> (64, -d0 + 5)>(%arg1)
-        %10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [3, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:3x5xf32> -> tensor<3x?xf32>
-        %11 = affine.min affine_map<(d0) -> (64, -d0 + 5)>(%arg0)
-        %12 = affine.min affine_map<(d0) -> (64, -d0 + 5)>(%arg1)
-        %13 = affine.min affine_map<(d0) -> (-d0 + 5, 64)>(%arg0)
-        %14 = affine.min affine_map<(d0) -> (-d0 + 5, 64)>(%arg1)
-        %15 = linalg.init_tensor [%13, %14] : tensor<?x?xf32>
-        %16 = linalg.fill(%cst, %15) {__internal_linalg_transform__ = "workgroup", lowering.config = #config0} : f32, tensor<?x?xf32> -> tensor<?x?xf32>
-        %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup", lowering.config = #config1} ins(%8, %10 : tensor<?x3xf32>, tensor<3x?xf32>) outs(%16 : tensor<?x?xf32>) -> tensor<?x?xf32>
-        flow.dispatch.tensor.store %17, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:5x5xf32>
-      }
-    }
-    return
-  }
-
-  hal.interface private @io  {
-    hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
-    hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
-    hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-  }
-}
-// CHECK-LABEL: @matmul_f32_5x3x5
-//       CHECK: %[[C0:.+]] = constant 0.000000e+00 : f32
-//       CHECK: %[[LHS:.+]] = hal.interface.binding.subspan @io::@s0b0_ro_external[%{{.*}}] : !flow.dispatch.tensor<readonly:5x3xf32>
-//       CHECK: %[[RHS:.+]] = hal.interface.binding.subspan @io::@s0b1_ro_external[%{{.*}}] : !flow.dispatch.tensor<readonly:3x5xf32>
-//       CHECK: %[[RESULT:.+]] = hal.interface.binding.subspan @io::@s0b2_xw_external[%{{.*}}] : !flow.dispatch.tensor<writeonly:5x5xf32>
-//       CHECK: flow.dispatch.tensor.load %[[LHS]], offsets = [%{{.*}}, 0], sizes = [%[[LHS_TILE_SIZE:.+]], 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:5x3xf32> -> tensor<?x3xf32>
-//       CHECK: flow.dispatch.tensor.load %[[RHS]], offsets = [0, %{{.*}}], sizes = [3, %[[RHS_TILE_SIZE:.+]]], strides = [1, 1] : !flow.dispatch.tensor<readonly:3x5xf32> -> tensor<3x?xf32>
-//       CHECK: %[[PADDED_LHS:.+]] = linalg.pad_tensor %{{.*}} low[0, 0] high[3, 1]  {
-//  CHECK-NEXT:     ^bb0(%{{.*}}: index, %{{.*}}: index):  // no predecessors
-//  CHECK-NEXT: linalg.yield %[[C0]] : f32
-//  CHECK-NEXT: tensor<?x3xf32> to tensor<8x4xf32>
-//       CHECK: %[[PADDED_RHS:.+]] = linalg.pad_tensor %{{.*}} low[0, 0] high[1, 3]  {
-//  CHECK-NEXT:     ^bb0(%{{.*}}: index, %{{.*}}: index):  // no predecessors
-//  CHECK-NEXT: linalg.yield %[[C0]] : f32
-//  CHECK-NEXT: tensor<3x?xf32> to tensor<4x8xf32>
-//       CHECK: %[[PADDED_RESULT:.+]] = linalg.init_tensor [8, 8] : tensor<8x8xf32>
-//       CHECK: %[[PADDED_RESULT_0:.+]] = linalg.fill(%[[C0]], %[[PADDED_RESULT]]) : f32, tensor<8x8xf32>
-//       CHECK: %[[MATMUL_RESULT:.+]] = linalg.matmul {{.*}} ins(%[[PADDED_LHS]], %[[PADDED_RHS]] : tensor<8x4xf32>, tensor<4x8xf32>) outs(%[[PADDED_RESULT_0]] : tensor<8x8xf32>) -> tensor<8x8xf32>
-//       CHECK: %[[CLIPPED_RESULT:.+]] = tensor.extract_slice %[[MATMUL_RESULT]][0, 0] [%[[LHS_TILE_SIZE]], %[[RHS_TILE_SIZE]]] [1, 1] : tensor<8x8xf32> to tensor<?x?xf32>
-//       CHECK:  flow.dispatch.tensor.store %[[CLIPPED_RESULT]], %[[RESULT]], offsets = [%{{.*}}, %{{.*}}], sizes = [%[[LHS_TILE_SIZE]], %[[RHS_TILE_SIZE]]], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:5x5xf32>
diff --git a/iree/compiler/Codegen/Passes.h b/iree/compiler/Codegen/Passes.h
index 5417c3a..8c86c1d 100644
--- a/iree/compiler/Codegen/Passes.h
+++ b/iree/compiler/Codegen/Passes.h
@@ -116,10 +116,6 @@
 std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>>
 createLLVMCPULowerExecutableTargetPass(bool lowerToVectors = true);
 
-/// Pad linalg ops workgroup tiles into the next integer multiple of the target
-/// vector size.
-std::unique_ptr<OperationPass<FuncOp>> createLLVMCPUPadWorkgroupTilesPass();
-
 /// Converts linalg.conv into linalg.generic with a CPU-friendly iteration
 /// order.
 std::unique_ptr<OperationPass<FuncOp>> createLLVMCPUPlanConvLoopOrderPass();
diff --git a/iree/compiler/Codegen/Passes.td b/iree/compiler/Codegen/Passes.td
index 6233052..c74d2a4 100644
--- a/iree/compiler/Codegen/Passes.td
+++ b/iree/compiler/Codegen/Passes.td
@@ -103,13 +103,6 @@
       "mlir::iree_compiler::createLLVMCPULowerExecutableTargetPass()";
 }
 
-def LLVMCPUPadWorkgroupTiles :
-    Pass<"iree-llvmcpu-pad-workgroup-tiles", "FuncOp"> {
-  let summary =
-      "Pad workgroup tiles to an integer multiple of tiling parameters.";
-  let constructor = "mlir::iree_compiler::createLLVMCPUPadWorkgroupTilesPass()";
-}
-
 def LLVMCPUPlanConvLoopOrder :
     Pass<"iree-llvmcpu-plan-conv-loop-order", "FuncOp"> {
   let summary =
diff --git a/iree/compiler/Codegen/SPIRV/KernelConfig.cpp b/iree/compiler/Codegen/SPIRV/KernelConfig.cpp
index fa57109..cf8c39d 100644
--- a/iree/compiler/Codegen/SPIRV/KernelConfig.cpp
+++ b/iree/compiler/Codegen/SPIRV/KernelConfig.cpp
@@ -208,6 +208,10 @@
 LogicalResult setMatmulOpConfig(linalg::LinalgOp op,
                                 std::array<int64_t, 2> bestWorkgroupSizeXY,
                                 std::array<int64_t, 3> bestThreadTileSizeMNK) {
+  auto lhsType = op.inputs()[0].getType().cast<ShapedType>();
+  auto elementBits = lhsType.getElementType().getIntOrFloatBitWidth();
+  if (elementBits != 16 && elementBits != 32) return success();
+
   ArrayRef<int64_t> lhsShape = getUntiledShape(op.inputs()[0]);
   ArrayRef<int64_t> rhsShape = getUntiledShape(op.inputs()[1]);
   if (llvm::any_of(lhsShape, ShapedType::isDynamic)) return success();
@@ -282,12 +286,13 @@
 
   // Deduce the configuration for the K dimension. We need some power of two
   // here so that we can do vector load.
-  for (int64_t t = llvm::PowerOf2Floor(residualTilingFactor); t >= 1; t >>= 1) {
+  for (int64_t t = llvm::PowerOf2Floor(residualTilingFactor); t >= 2; t >>= 1) {
     if (dimK % t == 0) {
       workgroupTileSizes[2 + isBM] = invocationTileSizes[2 + isBM] = t;
       break;
     }
   }
+  if (workgroupTileSizes[2 + isBM] == 0) return success();
 
   auto pipeline = IREE::HAL::DispatchLoweringPassPipeline::SPIRVVectorize;
   TileSizesListType tileSizes;
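
Two bail-outs are added above: setMatmulOpConfig now returns early (leaving the default pipeline in place) for element types other than 16 or 32 bits, and the K-dimension tile search starts at 2 instead of 1, so an odd K no longer gets a degenerate tile of 1. A standalone C++ sketch of the revised K-tile deduction (powerOf2Floor here is a local stand-in for llvm::PowerOf2Floor, assuming x >= 1; not IREE code):

#include <cstdint>
#include <cstdio>

// Local stand-in for llvm::PowerOf2Floor: largest power of two <= x.
static int64_t powerOf2Floor(int64_t x) {
  int64_t p = 1;
  while (p * 2 <= x) p *= 2;
  return p;
}

// Returns the chosen K tile, or 0 when no power of two >= 2 divides dimK
// (e.g. any odd K), in which case the caller now skips vectorization.
static int64_t deduceKTile(int64_t dimK, int64_t residualTilingFactor) {
  for (int64_t t = powerOf2Floor(residualTilingFactor); t >= 2; t >>= 1)
    if (dimK % t == 0) return t;
  return 0;
}

int main() {
  std::printf("%lld\n", (long long)deduceKTile(32, 8));  // 8
  std::printf("%lld\n", (long long)deduceKTile(3, 8));   // 0 -> bail out
}

The new config_default_matmul.mlir test below exercises exactly these paths: K = 3 yields no valid tile and i8 operands fail the bit-width check, so both cases fall back to SPIRVDistribute.
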
diff --git a/iree/compiler/Codegen/SPIRV/test/BUILD b/iree/compiler/Codegen/SPIRV/test/BUILD
index 5f847d4..75967bc 100644
--- a/iree/compiler/Codegen/SPIRV/test/BUILD
+++ b/iree/compiler/Codegen/SPIRV/test/BUILD
@@ -21,6 +21,7 @@
         [
             "config_adreno_conv.mlir",
             "config_adreno_matmul.mlir",
+            "config_default_matmul.mlir",
             "config_linalg_ext_ops.mlir",
             "config_linalg_ops.mlir",
             "config_mali_conv.mlir",
diff --git a/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt b/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt
index 0d805b1..b42fcef 100644
--- a/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt
+++ b/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt
@@ -16,6 +16,7 @@
   SRCS
     "config_adreno_conv.mlir"
     "config_adreno_matmul.mlir"
+    "config_default_matmul.mlir"
     "config_linalg_ext_ops.mlir"
     "config_linalg_ops.mlir"
     "config_mali_conv.mlir"
diff --git a/iree/compiler/Codegen/SPIRV/test/config_default_matmul.mlir b/iree/compiler/Codegen/SPIRV/test/config_default_matmul.mlir
new file mode 100644
index 0000000..6cbdd91
--- /dev/null
+++ b/iree/compiler/Codegen/SPIRV/test/config_default_matmul.mlir
@@ -0,0 +1,162 @@
+// RUN: iree-opt -split-input-file -mlir-print-local-scope -pass-pipeline='hal.executable(hal.executable.variant(iree-spirv-lower-executable-target-pass{test-lowering-configuration=true}))' %s | IreeFileCheck %s
+
+// Odd K that forbids vectorization.
+
+hal.executable @batch_matmul_1x3x32 {
+  hal.interface public @io {
+    hal.interface.binding public @s0b0_ro_constant, set=0, binding=0, type="StorageBuffer", access="Read"
+    hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+    hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+  }
+  hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {
+      spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, {
+        max_compute_shared_memory_size = 16384 : i32,
+        max_compute_workgroup_invocations = 128 : i32,
+        max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>,
+        subgroup_size = 4 : i32}>
+    }> {
+    hal.executable.entry_point public @batch_matmul_1x3x32 attributes {interface = @io, ordinal = 0 : index}
+    builtin.module  {
+      func @batch_matmul_1x3x32() {
+        %c0 = constant 0 : index
+        %c32 = constant 32 : index
+        %c3 = constant 3 : index
+        %c1 = constant 1 : index
+        %cst = constant 0.000000e+00 : f32
+        %0 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:1x3x3xf32>
+        %1 = hal.interface.binding.subspan @io::@s0b0_ro_constant[%c0] : !flow.dispatch.tensor<readonly:1x3x32xf32>
+        %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1x3x32xf32>
+        %workgroup_size_x = hal.interface.workgroup.size[0] : index
+        %workgroup_size_y = hal.interface.workgroup.size[1] : index
+        %workgroup_size_z = hal.interface.workgroup.size[2] : index
+        %workgroup_id_x = hal.interface.workgroup.id[0] : index
+        %workgroup_count_x = hal.interface.workgroup.count[0] : index
+        %workgroup_id_y = hal.interface.workgroup.id[1] : index
+        %workgroup_count_y = hal.interface.workgroup.count[1] : index
+        %workgroup_id_z = hal.interface.workgroup.id[2] : index
+        %workgroup_count_z = hal.interface.workgroup.count[2] : index
+        %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
+        %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
+        scf.for %arg0 = %3 to %c1 step %4 {
+          %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
+          %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
+          scf.for %arg1 = %5 to %c3 step %6 {
+            %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
+            %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
+            scf.for %arg2 = %7 to %c32 step %8 {
+              %9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 1)>(%arg0)[%workgroup_size_z]
+              %10 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 3)>(%arg1)[%workgroup_size_y]
+              %11 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [%9, %10, 3], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:1x3x3xf32> -> tensor<?x?x3xf32>
+              %12 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 1)>(%arg0)[%workgroup_size_z]
+              %13 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
+              %14 = flow.dispatch.tensor.load %1, offsets = [%arg0, 0, %arg2], sizes = [%12, 3, %13], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:1x3x32xf32> -> tensor<?x3x?xf32>
+              %15 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 1)>(%arg0)[%workgroup_size_z]
+              %16 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 3)>(%arg1)[%workgroup_size_y]
+              %17 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 32)>(%arg2)[%workgroup_size_x]
+              %18 = affine.min affine_map<(d0)[s0] -> (-d0 + 1, s0)>(%arg0)[%workgroup_size_z]
+              %19 = affine.min affine_map<(d0)[s0] -> (-d0 + 3, s0)>(%arg1)[%workgroup_size_y]
+              %20 = affine.min affine_map<(d0)[s0] -> (-d0 + 32, s0)>(%arg2)[%workgroup_size_x]
+              %21 = linalg.init_tensor [%18, %19, %20] : tensor<?x?x?xf32>
+              %22 = linalg.fill(%cst, %21) : f32, tensor<?x?x?xf32> -> tensor<?x?x?xf32>
+              %23 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup"} ins(%11, %14 : tensor<?x?x3xf32>, tensor<?x3x?xf32>) outs(%22 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+              flow.dispatch.tensor.store %23, %2, offsets = [%arg0, %arg1, %arg2], sizes = [%15, %16, %17], strides = [1, 1, 1] : tensor<?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x3x32xf32>
+            }
+          }
+        }
+        return
+      }
+      hal.interface private @io {
+        hal.interface.binding public @s0b0_ro_constant, set=0, binding=0, type="StorageBuffer", access="Read"
+        hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+        hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+      }
+    }
+  }
+}
+
+//          CHECK-LABEL: hal.executable.entry_point public @batch_matmul_1x3x32
+//           CHECK-SAME:   translation.info = {passPipeline = "SPIRVDistribute", workloadPerWorkgroup = [4, 1, 1]}
+//           CHECK-SAME:   workgroup_size = [4 : index, 1 : index, 1 : index]
+//           CHECK-NEXT: ^{{.+}}(%[[X:.+]]: index, %[[Y:.+]]: index, %[[Z:.+]]: index):
+//           CHECK-NEXT:   %[[X_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 4)>()[%[[X]]]
+//           CHECK-NEXT:   hal.return %[[X_COUNT]], %[[Y]], %[[Z]]
+
+//                CHECK: func @batch_matmul_1x3x32()
+//                CHECK:   linalg.batch_matmul
+//  CHECK-SAME{LITERAL}:     lowering.config = {tileSizes = [[1, 1, 4], [], [1, 1, 1]]}
+
+// -----
+
+// Non-16 / non-32 bit types cannot be vectorized right now.
+
+hal.executable private @matmul_64x16 {
+  hal.interface public @io {
+    hal.interface.binding public @s0b0_ro_constant, set=0, binding=0, type="StorageBuffer", access="Read"
+    hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+    hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+  }
+  hal.executable.variant public @vulkan_spirv_fb, target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {
+      spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, {
+        max_compute_shared_memory_size = 16384 : i32,
+        max_compute_workgroup_invocations = 128 : i32,
+        max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>,
+        subgroup_size = 4 : i32}>
+  }> {
+    hal.executable.entry_point public @matmul_64x16 attributes {interface = @io, ordinal = 0 : index}
+    builtin.module  {
+      func @matmul_64x16() {
+        %c0 = constant 0 : index
+        %c16 = constant 16 : index
+        %c64 = constant 64 : index
+        %c0_i32 = constant 0 : i32
+        %0 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:64x32xi8>
+        %1 = hal.interface.binding.subspan @io::@s0b0_ro_constant[%c0] : !flow.dispatch.tensor<readonly:32x16xi8>
+        %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:64x16xi32>
+        %workgroup_size_x = hal.interface.workgroup.size[0] : index
+        %workgroup_size_y = hal.interface.workgroup.size[1] : index
+        %workgroup_id_x = hal.interface.workgroup.id[0] : index
+        %workgroup_count_x = hal.interface.workgroup.count[0] : index
+        %workgroup_id_y = hal.interface.workgroup.id[1] : index
+        %workgroup_count_y = hal.interface.workgroup.count[1] : index
+        %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
+        %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
+        scf.for %arg0 = %3 to %c64 step %4 {
+          %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
+          %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
+          scf.for %arg1 = %5 to %c16 step %6 {
+            %7 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 64)>(%arg0)[%workgroup_size_y]
+            %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:64x32xi8> -> tensor<?x32xi8>
+            %9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 16)>(%arg1)[%workgroup_size_x]
+            %10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [32, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:32x16xi8> -> tensor<32x?xi8>
+            %11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 64)>(%arg0)[%workgroup_size_y]
+            %12 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 16)>(%arg1)[%workgroup_size_x]
+            %13 = affine.min affine_map<(d0)[s0] -> (-d0 + 64, s0)>(%arg0)[%workgroup_size_y]
+            %14 = affine.min affine_map<(d0)[s0] -> (-d0 + 16, s0)>(%arg1)[%workgroup_size_x]
+            %15 = linalg.init_tensor [%13, %14] : tensor<?x?xi32>
+            %16 = linalg.fill(%c0_i32, %15) : i32, tensor<?x?xi32> -> tensor<?x?xi32>
+            %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%8, %10 : tensor<?x32xi8>, tensor<32x?xi8>) outs(%16 : tensor<?x?xi32>) -> tensor<?x?xi32>
+            flow.dispatch.tensor.store %17, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:64x16xi32>
+          }
+        }
+        return
+      }
+      hal.interface private @io {
+        hal.interface.binding public @s0b0_ro_constant, set=0, binding=0, type="StorageBuffer", access="Read"
+        hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+        hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+      }
+    }
+  }
+}
+
+//          CHECK-LABEL: hal.executable.entry_point public @matmul_64x16
+//           CHECK-SAME:   translation.info = {passPipeline = "SPIRVDistribute", workloadPerWorkgroup = [4, 1]}
+//           CHECK-SAME:   workgroup_size = [4 : index, 1 : index, 1 : index]
+//           CHECK-NEXT: ^{{.+}}(%[[X:.+]]: index, %[[Y:.+]]: index, %{{.+}}: index):
+//           CHECK-NEXT:   %[[ONE:.+]] = constant 1 : index
+//           CHECK-NEXT:   %[[X_COUNT:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 4)>()[%[[X]]]
+//           CHECK-NEXT:   hal.return %[[X_COUNT]], %[[Y]], %[[ONE]]
+
+//                CHECK: func @matmul_64x16()
+//                CHECK:   linalg.matmul
+//  CHECK-SAME{LITERAL}:     lowering.config = {tileSizes = [[1, 4], [], [1, 1]]}
diff --git a/iree/compiler/Dialect/Flow/Transforms/DispatchLinalgOnTensors.cpp b/iree/compiler/Dialect/Flow/Transforms/DispatchLinalgOnTensors.cpp
index 657866f..30c7932 100644
--- a/iree/compiler/Dialect/Flow/Transforms/DispatchLinalgOnTensors.cpp
+++ b/iree/compiler/Dialect/Flow/Transforms/DispatchLinalgOnTensors.cpp
@@ -717,6 +717,9 @@
     auto linalgOp = dyn_cast<linalg::LinalgOp>(op);
     if (!linalgOp || !linalgOp.hasTensorSemantics()) return failure();
     if (!hasRootOpAttribute(op)) return failure();
+    if (op->getParentOfType<IREE::Flow::DispatchWorkgroupsOp>()) {
+      return failure();
+    }
 
     // TODO(ravishankarm): It is getting strange to track when to apply this
     // pattern and when not to. Need to revisit this, with dynamic shape cases
@@ -798,6 +801,9 @@
                                 PatternRewriter &rewriter) const override {
     if (!hasRootOpAttribute(tilableOp)) return failure();
     if (hasOnlyDimUses(tilableOp)) return failure();
+    if (tilableOp->getParentOfType<IREE::Flow::DispatchWorkgroupsOp>()) {
+      return failure();
+    }
 
     SmallVector<StringRef> iteratorTypes = tilableOp.getLoopIteratorTypes();
     SmallVector<Range> loopRanges = tilableOp.getLoopBounds(rewriter);
@@ -1079,7 +1085,7 @@
     // order here.
     for (Operation &op : llvm::reverse(block)) {
       // Start with a root operation and fuse its producers.
-      if (!isRootOp(&op)) continue;
+      if (hasFusionGroupsAttribute(&op) || !isRootOp(&op)) continue;
       unsigned newGroup = numRootOps++;
       setRootAttribute(context, &op, newGroup);
 
@@ -1127,6 +1133,11 @@
                 consumerIndexingMap.getResults()) {
           continue;
         }
+        if (llvm::any_of(
+                consumer.getOutputOperands(), [&consumer](OpOperand *operand) {
+                  return !consumer.getTiedIndexingMap(operand).isIdentity();
+                }))
+          continue;
         int64_t rootNumber = getRootNumber(op);
         setRootAttribute(context, user, rootNumber);
         removeRootOpAttribute(op);
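
Both the wrapping and the tiling pattern above now share the same guard: a root op that already sits inside a flow.dispatch.workgroups region must not be dispatched again. A minimal standalone model of that ancestor check (a toy op tree standing in for op->getParentOfType<IREE::Flow::DispatchWorkgroupsOp>(); not IREE code):

#include <cstdio>
#include <cstring>

struct Op {
  const char *name;
  Op *parent;  // enclosing op, or nullptr at module scope
};

// Walk the ancestor chain, as getParentOfType<> does across regions.
static bool insideDispatchRegion(const Op *op) {
  for (Op *p = op->parent; p; p = p->parent)
    if (!std::strcmp(p->name, "flow.dispatch.workgroups")) return true;
  return false;
}

int main() {
  Op module{"builtin.module", nullptr};
  Op dispatch{"flow.dispatch.workgroups", &module};
  Op inner{"linalg.matmul", &dispatch};  // already dispatched: skip
  Op outer{"linalg.matmul", &module};    // still a candidate root
  std::printf("%d %d\n", insideDispatchRegion(&inner),
              insideDispatchRegion(&outer));  // 1 0
}
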
diff --git a/iree/compiler/Dialect/Flow/Transforms/FusionOfTensorOps.cpp b/iree/compiler/Dialect/Flow/Transforms/FusionOfTensorOps.cpp
index 27b1607..02b3b25 100644
--- a/iree/compiler/Dialect/Flow/Transforms/FusionOfTensorOps.cpp
+++ b/iree/compiler/Dialect/Flow/Transforms/FusionOfTensorOps.cpp
@@ -79,12 +79,35 @@
                           consumer.getOwner()->operand_end());
           if (operands.size() >= kIreeMaxOperandCount) return false;
 
-          llvm::SmallDenseSet<Operation *, 4> numUsers;
-          for (Operation *user : producer.getUsers()) {
-            if (isa<linalg::GenericOp>(user)) continue;
-            numUsers.insert(user);
+          bool isBroadcast = false;
+          if (auto genericOp =
+                  dyn_cast<linalg::GenericOp>(producer.getOwner())) {
+            bool parallelOp =
+                llvm::all_of(genericOp.iterator_types(), [](Attribute attr) {
+                  return attr.cast<StringAttr>().getValue() ==
+                         getParallelIteratorTypeName();
+                });
+            if (parallelOp) {
+              for (OpOperand *opOperand : genericOp.getInputOperands()) {
+                AffineMap indexingMap = genericOp.getTiedIndexingMap(opOperand);
+                if (indexingMap.isProjectedPermutation() &&
+                    indexingMap.getNumDims() != indexingMap.getNumResults()) {
+                  isBroadcast = true;
+                  break;
+                }
+              }
+            }
           }
-          return numUsers.empty();
+          // Only fuse if it has a single linalg generic user. It is a
+          // simplistic heuristic to avoid duplicating ops that may be
+          // expensive.
+          // TODO: Add a cost model to allow ops to be duplicated.
+          if (!isBroadcast && !isa<ConstantOp>(producer.getOwner()) &&
+              !llvm::hasSingleElement(producer.getUsers()))
+            return false;
+          return llvm::all_of(producer.getUsers(), [](Operation *user) {
+            return isa<linalg::GenericOp>(user);
+          });
         };
     // Simple heuristic to decide if the reshape should be folded into the linalg.
     // If the source of the reshape is a linalg op fold to potentially allow the
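
The rewritten fusion-control function reads as three checks: treat the producer as broadcast-like when it is an all-parallel linalg.generic with some input indexing map that is a projected permutation dropping dimensions; unless it is broadcast-like or a constant, demand a single user so potentially expensive ops are not duplicated; and in every case require all users to be linalg.generic ops. A simplified standalone model of that decision (the struct fields summarize queries the real code makes through the MLIR API; not IREE code):

struct ProducerInfo {
  bool allParallel;      // every iterator type is "parallel"
  bool hasBroadcastMap;  // some input map is a projected permutation with
                         // fewer results than dims (i.e. a broadcast)
  bool isConstant;       // the producer is a ConstantOp
  int numUsers;          // users of the produced value
  bool allUsersGeneric;  // every user is a linalg.generic
};

static bool shouldFuse(const ProducerInfo &p) {
  bool isBroadcast = p.allParallel && p.hasBroadcastMap;
  // Broadcasts and constants are cheap to duplicate into several consumers;
  // anything else must have exactly one user.
  if (!isBroadcast && !p.isConstant && p.numUsers != 1) return false;
  // Fusion only applies when every user can absorb the producer.
  return p.allUsersGeneric;
}

int main() {
  ProducerInfo broadcast{true, true, false, 3, true};
  ProducerInfo pointwise{true, false, false, 2, true};
  return shouldFuse(broadcast) && !shouldFuse(pointwise) ? 0 : 1;
}
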
diff --git a/iree/compiler/Dialect/Flow/Transforms/Passes.cpp b/iree/compiler/Dialect/Flow/Transforms/Passes.cpp
index d005ef7..db00679 100644
--- a/iree/compiler/Dialect/Flow/Transforms/Passes.cpp
+++ b/iree/compiler/Dialect/Flow/Transforms/Passes.cpp
@@ -117,13 +117,13 @@
       mlir::createLinalgFoldUnitExtentDimsPass());
   passManager.addNestedPass<mlir::FuncOp>(createInterchangeGenericOpsPass());
   passManager.addNestedPass<mlir::FuncOp>(mlir::createCanonicalizerPass());
+  passManager.addPass(memref::createResolveShapedTypeResultDimsPass());
   passManager.addNestedPass<mlir::FuncOp>(createFusionOfTensorOpsPass());
   passManager.addNestedPass<mlir::FuncOp>(mlir::createCSEPass());
   if (clEnableLinalgDetensorize) {
     passManager.addNestedPass<mlir::FuncOp>(
         mlir::createLinalgDetensorizePass());
   }
-  passManager.addPass(memref::createResolveShapedTypeResultDimsPass());
   passManager.addNestedPass<mlir::FuncOp>(
       createConvertToFlowBeforeDispatchFormation());
   passManager.addNestedPass<mlir::FuncOp>(mlir::createCanonicalizerPass());
diff --git a/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir b/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir
index 0dc7904..2d35b8a 100644
--- a/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir
+++ b/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir
@@ -1073,3 +1073,26 @@
 //       CHECK:         scf.for %[[X:.+]] =
 //       CHECK:           %[[POOL:.+]] = linalg.pooling_nhwc_sum
 //       CHECK:           flow.dispatch.tensor.store %[[POOL]], %[[OUTPUT]], offsets = [0, %[[Z]], %[[Y]], %[[X]]], sizes = [1, %{{.+}}, %{{.+}}, %{{.+}}]
+
+// -----
+
+func @named_op_outs_fusion(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %c0 = constant 0 : index
+  %c1 = constant 1 : index
+  %cstm1 = constant -1.0 : f64
+  %cst1 = constant 1.0 : f64
+  %c12345 = constant 12345 : i32
+  %d0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
+  %d1 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
+  %init = linalg.init_tensor [%d0, %d1] : tensor<?x?xf32>
+  %fill = linalg.fill_rng_2d ins(%cstm1, %cst1, %c12345 : f64, f64, i32)
+      outs(%init : tensor<?x?xf32>) -> tensor<?x?xf32>
+  %matmul = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
+      outs(%fill : tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %matmul : tensor<?x?xf32>
+}
+// CHECK-LABEL: func @named_op_outs_fusion
+//       CHECK:   flow.dispatch.workgroups
+//       CHECK:     %[[FILL:.+]] = linalg.fill_rng_2d
+//       CHECK:     linalg.matmul
+//  CHECK-SAME:       outs(%[[FILL]] : tensor<?x?xf32>)
diff --git a/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp b/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp
index 8d63e0a..d6834d0 100644
--- a/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp
+++ b/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp
@@ -34,6 +34,12 @@
 static llvm::cl::opt<bool> dumpPtx("iree-cuda-dump-ptx", llvm::cl::init(false),
                                    llvm::cl::desc("Dump ptx"));
 
+static llvm::cl::opt<bool> clDisableLoopNounrollWa(
+    "iree-hal-cuda-disable-loop-nounroll-wa",
+    llvm::cl::desc(
+        "Disable the workaround for a bug in ptxas in CUDA versions before 11.4"),
+    llvm::cl::init(false));
+
 namespace mlir {
 namespace iree_compiler {
 namespace IREE {
@@ -51,7 +57,9 @@
     // the no unroll metadata. This bug is fixed in cuda 11.4 but since we still
     // run on older drivers we need to keep it.
     // TODO(thomasraoux): Remove it once we stop supporting older drivers.
-    codegenPasses.add(llvm::createSetNoUnrollPass());
+    if (!clDisableLoopNounrollWa) {
+      codegenPasses.add(llvm::createSetNoUnrollPass());
+    }
     targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr,
                                       llvm::CGFT_AssemblyFile);
     codegenPasses.run(module);
diff --git a/iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.cpp b/iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.cpp
index a06b026..4ae4556 100644
--- a/iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.cpp
+++ b/iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.cpp
@@ -759,6 +759,69 @@
   return tiledFftOp;
 }
 
+//===----------------------------------------------------------------------===//
+// ReverseOp
+//===----------------------------------------------------------------------===//
+
+static LogicalResult verifyReverseOp(ReverseOp op) {
+  if (op.getNumInputs()) {
+    return op.emitOpError("expected no inputs");
+  }
+  if (op.getNumOutputs() != 1) {
+    return op.emitOpError("expected exactly one output");
+  }
+
+  int64_t rank = op.getOperandRank();
+  int64_t dimension = op.dimension();
+  if (dimension < 0 || dimension >= rank) {
+    return op.emitOpError("dimension must be within [0, ") << rank << ")";
+  }
+
+  return success();
+}
+
+bool ReverseOp::payloadUsesValueFromOperand(OpOperand *) { return false; }
+
+SmallVector<StringRef> ReverseOp::getLoopIteratorTypes() {
+  SmallVector<StringRef> iteratorTypes(getOperandRank(),
+                                       getParallelIteratorTypeName());
+  return iteratorTypes;
+}
+
+SmallVector<Range> ReverseOp::getLoopBounds(OpBuilder &builder) {
+  Location loc = getLoc();
+  Value zero = builder.create<ConstantIndexOp>(loc, 0);
+  Value one = builder.create<ConstantIndexOp>(loc, 1);
+  SmallVector<Range> ranges;
+  for (auto dim : llvm::seq<int64_t>(0, getOperandRank())) {
+    Value ub = getDimValue(builder, loc, operand(), dim);
+    ranges.emplace_back(Range{zero, ub, one});
+  }
+  auto dim = dimension();
+  ranges[dim].size = builder.create<SignedDivIOp>(
+      loc, ranges[dim].size, builder.create<ConstantIndexOp>(loc, 2));
+  return ranges;
+}
+
+LogicalResult ReverseOp::generateScalarImplementation(OpBuilder &b,
+                                                      Location loc,
+                                                      ValueRange ivs) {
+  SmallVector<Value> mirrorIndices(ivs.begin(), ivs.end());
+  auto dim = dimension();
+  auto size = getDimValue(b, loc, operand(), dim);
+  size = b.create<SubIOp>(loc, size, b.create<ConstantIndexOp>(loc, 1));
+  mirrorIndices[dim] = b.create<SubIOp>(loc, size, mirrorIndices[dim]);
+
+  // for (int i = 0; i < n / 2; ++i) {
+  //   swap(array[i], array[n - 1 - i]);
+  // }
+  Value v1 = b.create<memref::LoadOp>(loc, operand(), ivs);
+  Value v2 = b.create<memref::LoadOp>(loc, operand(), mirrorIndices);
+  b.create<memref::StoreOp>(loc, v1, operand(), mirrorIndices);
+  b.create<memref::StoreOp>(loc, v2, operand(), ivs);
+  return success();
+}
+
 #define DEFINE_OP_GET_EFFECTS(OP_NAME)                                    \
   void OP_NAME::getEffects(                                               \
       SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>> \
@@ -772,6 +835,7 @@
 DEFINE_OP_GET_EFFECTS(ScatterOp)
 DEFINE_OP_GET_EFFECTS(SortOp)
 DEFINE_OP_GET_EFFECTS(FftOp)
+DEFINE_OP_GET_EFFECTS(ReverseOp)
 
 }  // namespace linalg_ext
 }  // namespace iree_compiler
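
The scalar lowering for linalg_ext.reverse pairs with getLoopBounds above: the reversed dimension's loop bound is halved, and each iteration swaps an element with its mirror at n - 1 - i. For a 1-D buffer the emitted loop nest computes exactly the classic in-place reverse, sketched standalone below (not the MLIR lowering itself):

#include <cstdio>
#include <utility>

// What ReverseOp's generateScalarImplementation computes for a 1-D memref.
static void reverseDim0(int *a, int n) {
  // Loop bound is n / 2 (getLoopBounds); the body mirrors the index and
  // swaps (generateScalarImplementation). An odd middle element stays put.
  for (int i = 0; i < n / 2; ++i)
    std::swap(a[i], a[n - 1 - i]);
}

int main() {
  int a[5] = {1, 2, 3, 4, 5};
  reverseDim0(a, 5);
  for (int v : a) std::printf("%d ", v);  // prints: 5 4 3 2 1
}
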
diff --git a/iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.td b/iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.td
index fe847fb..9841da1 100644
--- a/iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.td
+++ b/iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.td
@@ -245,6 +245,49 @@
   }];
 }
 
+def LinalgExt_ReverseOp : LinalgExt_Op<"reverse", [
+  DeclareOpInterfaceMethods<TiledOpInterface, ["generateScalarImplementation"]>,
+  DeclareOpInterfaceMethods<LinalgExtInterface,
+                            // ReverseOp does not have a region, so we have to
+                            // overwrite the method.
+                            ["payloadUsesValueFromOperand"]>]> {
+  let summary = "Reverse operator";
+  let description = [{
+    A temporary solution for a reverse op. The loop bound of the reversed
+    dimension is half its extent, since we can simply swap elements. E.g.,
+
+    for (int i = 0; i < n / 2; ++i) {
+      std::swap(a[i], a[n - 1 - i]);
+    }
+  }];
+
+  let arguments = (ins Variadic<AnyShaped>:$inputs,
+                       Variadic<AnyShaped>:$outputs,
+                       I64Attr:$dimension
+  );
+  let results = (outs Variadic<AnyRankedTensor>:$results);
+  let assemblyFormat = [{
+    `dimension` `(` $dimension `)`
+    attr-dict (`ins` `(` $inputs^ `:` type($inputs) `)`)?
+    `outs` `(` $outputs `:` type($outputs) `)`
+    (`:` type($results)^)?
+  }];
+  let extraClassDeclaration = extraLinalgExtOpClassDeclaration # [{
+    Value operand() {
+      return getOutputOperand(0)->get();
+    }
+    ShapedType getOperandType() {
+      return operand().getType().cast<ShapedType>();
+    }
+    int64_t getOperandRank() {
+      return getOperandType().getRank();
+    }
+    ArrayRef<int64_t> getOperandShape() {
+      return getOperandType().getShape();
+    }
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // Pure ops
 //===----------------------------------------------------------------------===//
diff --git a/iree/compiler/Dialect/LinalgExt/IR/test/roundtrip.mlir b/iree/compiler/Dialect/LinalgExt/IR/test/roundtrip.mlir
index b2deb0a..e488bc1 100644
--- a/iree/compiler/Dialect/LinalgExt/IR/test/roundtrip.mlir
+++ b/iree/compiler/Dialect/LinalgExt/IR/test/roundtrip.mlir
@@ -387,3 +387,42 @@
 //  CHECK-SAME:    outs(%[[REAL]], %[[IMAG]] : tensor<1024xf32>, tensor<1024xf32>)
 //  CHECK-SAME:   : tensor<1024xf32>, tensor<1024xf32>
 //       CHECK:   return %[[RES]]#0, %[[RES]]#1
+
+// -----
+
+func @reverse_tensor(%arg0: tensor<3x5xi32>) -> tensor<3x5xi32> {
+  %0 = linalg_ext.reverse
+         dimension(0)
+         outs(%arg0 : tensor<3x5xi32>) : tensor<3x5xi32>
+  return %0 : tensor<3x5xi32>
+}
+// CHECK-LABEL: func @reverse_tensor
+//  CHECK-SAME:   %[[ARG0:.+]]: tensor<3x5xi32>
+//       CHECK:   %[[RESULT:.+]] = linalg_ext.reverse dimension(0)
+//  CHECK-SAME:      outs(%[[ARG0]]
+
+// -----
+
+func @reverse_memref(%arg0: memref<3x5xi32>) {
+  linalg_ext.reverse
+    dimension(0)
+    outs(%arg0 : memref<3x5xi32>)
+  return
+}
+// CHECK-LABEL: func @reverse_memref
+//  CHECK-SAME:   %[[ARG0:.+]]: memref<3x5xi32>
+//       CHECK:   linalg_ext.reverse dimension(0)
+//  CHECK-SAME:      outs(%[[ARG0]]
+
+// -----
+
+func @reverse_dynamic_tensor(%arg0: tensor<?x?xi32>) -> tensor<?x?xi32> {
+  %0 = linalg_ext.reverse
+         dimension(1)
+         outs(%arg0 : tensor<?x?xi32>) : tensor<?x?xi32>
+  return %0 : tensor<?x?xi32>
+}
+// CHECK-LABEL: func @reverse_dynamic_tensor
+//  CHECK-SAME:   %[[ARG0:.+]]: tensor<?x?xi32>
+//       CHECK:   %[[RESULT:.+]] = linalg_ext.reverse dimension(1)
+//  CHECK-SAME:      outs(%[[ARG0]]
diff --git a/iree/compiler/Dialect/LinalgExt/Transforms/test/convert_to_loops.mlir b/iree/compiler/Dialect/LinalgExt/Transforms/test/convert_to_loops.mlir
index 423faca..84883b9 100644
--- a/iree/compiler/Dialect/LinalgExt/Transforms/test/convert_to_loops.mlir
+++ b/iree/compiler/Dialect/LinalgExt/Transforms/test/convert_to_loops.mlir
@@ -481,3 +481,29 @@
 // CHECK:               %[[RES3:.+]] = subf %[[L_REAL]], %[[T_REAL]]
 // CHECK:               %[[RES4:.+]] = subf %[[L_IMAG]], %[[T_IMAG]]
 // CHECK:               linalg.yield %[[RES1]], %[[RES2]], %[[RES3]], %[[RES4]]
+
+// -----
+
+func @reverse_dim_0(%arg0: memref<?x?xi32>) {
+  linalg_ext.reverse
+    dimension(0)
+    outs(%arg0 : memref<?x?xi32>)
+  return
+}
+// CHECK-LABEL: func @reverse_dim_0
+// CHECK-SAME:    %[[BUF:[a-zA-Z0-9]+]]
+// CHECK-DAG:     %[[C0:.+]] = constant 0 : index
+// CHECK-DAG:     %[[C1:.+]] = constant 1 : index
+// CHECK-DAG:     %[[C2:.+]] = constant 2 : index
+// CHECK-DAG:     %[[D0:.+]] = memref.dim %arg0, %c0 : memref<?x?xi32>
+// CHECK-DAG:     %[[D1:.+]] = memref.dim %arg0, %c1 : memref<?x?xi32>
+// CHECK-DAG:     %[[REV_UB:.+]] = divi_signed %[[D0]], %[[C2]] : index
+// CHECK:         scf.for %[[I:.+]] = %[[C0]] to %[[REV_UB]] step %[[C1]]
+// CHECK:           scf.for %[[J:.+]] = %[[C0]] to %[[D1]] step %[[C1]]
+// CHECK:             %[[T0:.+]] = memref.dim %[[BUF]], %[[C0]]
+// CHECK:             %[[T1:.+]] = subi %[[T0]], %[[C1]] : index
+// CHECK:             %[[T2:.+]] = subi %[[T1]], %[[I]] : index
+// CHECK:             %[[V0:.+]] = memref.load %[[BUF]][%[[I]], %[[J]]]
+// CHECK:             %[[V1:.+]] = memref.load %[[BUF]][%[[T2]], %[[J]]]
+// CHECK:             memref.store %[[V0]], %[[BUF]][%[[T2]], %[[J]]] : memref<?x?xi32>
+// CHECK:             memref.store %[[V1]], %[[BUF]][%[[I]], %[[J]]] : memref<?x?xi32>