// Copyright 2020 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//===- LinalgBufferizePass.cpp - Pass to bufferize Linalg on tensors -----===//
//
// The overall bufferization algorithm is summarized here. Each of the
// individual steps is explained in detail later.
//
// Problem statement:
//
// The bufferization in this file is intended for converting tensor operations
// into memref operations for ops within a dispatch region. The goal is to reuse
// the buffers provided as inputs/outputs by the hal layer as memrefs for each
// of the operations. If the transformation cannot reuse an input/output buffer
// to store an intermediate tensor, an allocation is done. This allocation is
// typically meant to target scratchspace memory.
//
// The algorithm has two phases: an analysis phase and a transformation phase.
//
// - The analysis phase walks the function and organizes relevant tensors
// (tensors that need to be converted to memrefs) into equivalence classes. Two
// tensors are part of the same equivalence class if they can eventually be
// mapped to the same memref. This allows determining which operations can use
// the buffer provided for the outputs to compute the results in place.
// - The transformation phase walks the function again and inserts corresponding
// memref operations. The tensor operations are still kept around since the
// analysis driving the transformation is based on the tensor values.
// - Converting tensor operations to memref operations when all operands use
// either buffers that are inputs to the dispatch or are allocated
// temporarily within the dispatch region can be achieved by a
// straight-forward walk.
// - Reusing the memref provided for the result of the dispatch for an
// operation is more involved and is explained below.
//
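// As a rough illustration of the intent (the IR below is only a sketch with
// illustrative names, not verbatim syntax), a dispatch region such as
//
// ```mlir
// %lhs = flow.dispatch.tensor.load %lhs_binding, ... : tensor<?x?xf32>
// %rhs = flow.dispatch.tensor.load %rhs_binding, ... : tensor<?x?xf32>
// %init = linalg.init_tensor [...] : tensor<?x?xf32>
// %fill = linalg.fill(%zero, %init) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
// %gemm = linalg.matmul ins(%lhs, %rhs : tensor<?x?xf32>, tensor<?x?xf32>)
// outs(%fill : tensor<?x?xf32>) -> tensor<?x?xf32>
// flow.dispatch.tensor.store %gemm, %result_binding, ...
// ```
//
// is expected to end up, after bufferization and cleanup, roughly as
//
// ```mlir
// %lhs = memref.subview %lhs_buffer[...] : memref<?x?xf32>
// %rhs = memref.subview %rhs_buffer[...] : memref<?x?xf32>
// %result = memref.subview %result_buffer[...] : memref<?x?xf32>
// linalg.fill(%zero, %result) : f32, memref<?x?xf32>
// linalg.matmul ins(%lhs, %rhs : memref<?x?xf32>, memref<?x?xf32>)
// outs(%result : memref<?x?xf32>)
// ```
//
// i.e. the `linalg` ops compute directly into the buffers provided by the hal
// layer, with no intermediate allocation.
//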
//===----------------------------------------------------------------------===//
#include "iree/compiler/Codegen/PassDetail.h"
#include "iree/compiler/Codegen/Passes.h"
#include "iree/compiler/Codegen/Transforms/Transforms.h"
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "iree/compiler/Dialect/Flow/IR/FlowOps.h"
#include "iree/compiler/Dialect/Flow/IR/FlowTypes.h"
#include "iree/compiler/Dialect/HAL/IR/HALOps.h"
#include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.h"
#include "iree/compiler/Dialect/Shape/IR/ShapeOps.h"
#include "iree/compiler/Dialect/Util/IR/UtilDialect.h"
#include "iree/compiler/Dialect/Util/IR/UtilOps.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/TypeSwitch.h"
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/Value.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/Passes.h"
#define DEBUG_TYPE "iree-codegen-linalg-bufferize"
namespace mlir {
namespace iree_compiler {
//===----------------------------------------------------------------------===//
// Analysis to compute equivalence sets.
//
// These functions compute the equivalence relationships between all tensors in
// the program. Two tensors are equivalent if they are to be mapped to the same
// buffer. For every operation, based on its semantics, the result of the
// operation can reuse the buffer of one of its operands. This information is
// captured by adding these two tensors to the same equivalence class.
// Eventually the result tensor of the dispatch is added to some equivalence
// set. All tensors in that equivalence set can reuse the result buffer and
// compute the values in place. Tensors can be added to an equivalence set only
// if
// - They have a single use
// - They are derived from a read-only buffer.
//
//===----------------------------------------------------------------------===//
/// Check if all users of an op that lowers to a subview can eventually use the
/// subview when converted to buffers. For example `linalg.reshape` (which is
/// the buffer version of `linalg.tensor_reshape`) cannot handle subviews.
static bool canUsersHandleSubviews(Operation *op) {
// TODO(ravishankarm): Maybe this is too aggressive, might have to switch this
// to have a white-list instead of blacklist.
for (Operation *user : op->getUsers()) {
if (isa<IREE::Flow::DispatchTensorStoreOp, linalg::TensorCollapseShapeOp,
linalg::TensorExpandShapeOp>(user)) {
return false;
}
}
return true;
}
/// Class that tracks the equivalence relationship between tensors. It's a
/// light-weight wrapper around `llvm::EquivalenceClasses` to account for
/// `Value` not being directly supported as a value type by that class.
class BufferizationPlan {
public:
llvm::EquivalenceClasses<void *>::iterator findValue(Value v) const {
return mappedTensors.findValue(getPointer(v));
}
llvm::EquivalenceClasses<void *>::iterator end() const {
return mappedTensors.end();
}
SmallVector<Value> getTensorsMappedToSameSet(Value v) const {
SmallVector<Value> tensors;
for (auto it = mappedTensors.findLeader(getPointer(v)),
ie = mappedTensors.member_end();
it != ie; ++it) {
tensors.push_back(getValue(*it));
}
return tensors;
}
bool isEquivalent(Value v1, Value v2) const {
return mappedTensors.isEquivalent(getPointer(v1), getPointer(v2));
}
void insert(Value v) { mappedTensors.insert(getPointer(v)); }
void unionSets(Value v1, Value v2) {
mappedTensors.unionSets(getPointer(v1), getPointer(v2));
}
/// Sets the equivalence class that contains `v` as the set that contains the
/// result tensor of the dispatch region (i.e. a tensor that is the `value`
/// operand of a `flow.dispatch.tensor.store` op). All operations in this
/// equivalence class can use the result buffer of the dispatch region to
/// compute their values in place.
void storeSet(Value v) { storeLeaders.insert(getLeaderValue(v)); }
/// Queries if the value `v` is in the same equivalence class as the result of
/// the dispatch region.
bool isInStoreSet(Value v) { return storeLeaders.count(getLeaderValue(v)); }
void dump() {
llvm::dbgs() << "BufferMappings : \n";
unsigned numSets = 0;
for (auto it = mappedTensors.begin(), ie = mappedTensors.end(); it != ie;
++it) {
if (!it->isLeader()) continue;
llvm::dbgs() << "\tSet " << numSets << ":\n";
for (auto member : llvm::make_range(mappedTensors.member_begin(it),
mappedTensors.member_end())) {
llvm::dbgs() << "\t\t";
getValue(member).print(llvm::dbgs());
llvm::dbgs() << "\n";
}
numSets++;
}
}
private:
Value getLeaderValue(Value v1) const {
return getValue(mappedTensors.getLeaderValue(getPointer(v1)));
}
void *getPointer(Value v) const { return v.getAsOpaquePointer(); }
Value getValue(void *v) const { return Value::getFromOpaquePointer(v); }
llvm::EquivalenceClasses<void *> mappedTensors;
/// Leaders of the sets that contain the result tensor of the dispatch
/// region, i.e. a tensor that is the `value` operand of a
/// `flow.dispatch.tensor.store` op.
llvm::DenseSet<Value> storeLeaders;
};
/// Walks the use-def chain and sees if this value comes from a read-only tensor.
static bool isFromReadOnlyTensor(Value v, const BufferizationPlan &plan) {
auto definingOp = v.getDefiningOp();
if (!definingOp) {
auto arg = v.cast<BlockArgument>();
return TypeSwitch<Operation *, bool>(arg.getOwner()->getParentOp())
.Case<scf::ForOp>([&](scf::ForOp forOp) {
Value initOperand = forOp.getOpOperandForRegionIterArg(arg).get();
if (plan.isEquivalent(arg, initOperand)) {
return isFromReadOnlyTensor(initOperand, plan);
}
return false;
})
.Default([&](Operation *op) { return false; });
}
return TypeSwitch<Operation *, bool>(definingOp)
.Case<ConstantOp>([&](ConstantOp constantOp) { return true; })
.Case<linalg::TensorCollapseShapeOp, linalg::TensorExpandShapeOp>(
[&](auto op) { return isFromReadOnlyTensor(op.src(), plan); })
.Case<tensor::ExtractSliceOp>([&](tensor::ExtractSliceOp sliceOp) {
return isFromReadOnlyTensor(sliceOp.source(), plan);
})
.Case<IREE::Flow::DispatchTensorLoadOp>(
[&](IREE::Flow::DispatchTensorLoadOp loadOp) {
return loadOp.source()
.getType()
.cast<IREE::Flow::DispatchTensorType>()
.getAccess() == IREE::Flow::TensorAccess::ReadOnly;
})
.Default([&](Operation *op) { return false; });
}
/// Adds the result of `std.constant` to its own set (there is nothing to tie
/// it to here).
static LogicalResult analyseConstantOp(ConstantOp constantOp,
BufferizationPlan &plan) {
if (!constantOp.getResult().getType().isa<ShapedType>()) return success();
plan.insert(constantOp.getResult());
return success();
}
/// Adds the result of the `flow.dispatch.tensor.load` op to the same
/// equivalence class as the source.
static LogicalResult analyseInterfaceLoadTensorOp(
IREE::Flow::DispatchTensorLoadOp loadOp, BufferizationPlan &plan) {
if (!(loadOp.getMixedOffsets().empty() && loadOp.getMixedSizes().empty() &&
loadOp.getMixedStrides().empty()) &&
!canUsersHandleSubviews(loadOp)) {
plan.insert(loadOp.source());
plan.insert(loadOp.result());
return success();
}
plan.unionSets(loadOp.result(), loadOp.source());
return success();
}
/// Helper method that returns an operation of type `OpType` whose result is in
/// the same equivalence set as `value`. Returns an operation if there is only
/// one such op in the equivalence set or nullptr in all other cases.
template <typename OpType>
static OpType getEquivalentOpOfType(Value value, BufferizationPlan &plan) {
OpType equivalentOp;
SmallVector<Value> mappedTensors = plan.getTensorsMappedToSameSet(value);
for (auto v : mappedTensors) {
auto definingOp = v.getDefiningOp<OpType>();
if (!definingOp) continue;
assert((!equivalentOp || equivalentOp == definingOp) &&
"found two interface binding ops marked as equivalent");
if (!equivalentOp) equivalentOp = definingOp;
}
return equivalentOp;
}
/// Returns true if the value and target of a `flow.dispatch.tensor.store`
/// operation can be added to the same equivalence set. This can be done only if
/// - The `value` is not from an equivalence set that contains a read-only
/// tensor.
/// - All `hal.interface.binding.subspan` operations in the equivalence class of
/// `value` and `target` have the same binding and offset. For now, it is
/// assumed that the equivalence classes contain only 1 such instruction.
/// This method asserts that the `target` equivalence class already contains a
/// `hal.interface.binding.subspan` op.
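/// For intuition, a hypothetical case where the two cannot be made equivalent
/// (the IR below is illustrative, not exact syntax):
///
/// ```mlir
/// %in = hal.interface.binding.subspan @io::@arg0[%c0]
/// : !flow.dispatch.tensor<readonly:?xf32>
/// %out = hal.interface.binding.subspan @io::@ret0[%c0]
/// : !flow.dispatch.tensor<writeonly:?xf32>
/// %t = flow.dispatch.tensor.load %in, ...
/// flow.dispatch.tensor.store %t, %out, ...
/// ```
///
/// Here `%t` is in the same equivalence set as a subspan with a different
/// binding than the store target (and a read-only one at that), so tying the
/// value to the target is not legal; the store later lowers to a copy instead.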
static bool canSetStoreValueAndTargetAsEquivalent(
IREE::Flow::DispatchTensorStoreOp storeOp, BufferizationPlan &plan) {
Value value = storeOp.value();
if (!(storeOp.getMixedOffsets().empty() && storeOp.getMixedSizes().empty() &&
storeOp.getMixedStrides().empty())) {
SmallVector<Value> mappedTensors = plan.getTensorsMappedToSameSet(value);
for (auto v : mappedTensors) {
// TODO(ravishankarm): At this point it is not clear why the following
// restriction exists. It might have something to do with subviews and
// reshapes not working well together, but the change that introduced this
// restriction does not document why it was added.
Operation *op = v.getDefiningOp();
if (op && isa<linalg::TensorCollapseShapeOp, linalg::TensorExpandShapeOp>(
v.getDefiningOp())) {
return false;
}
}
}
Value target = storeOp.target();
auto targetInterfaceOp =
getEquivalentOpOfType<IREE::HAL::InterfaceBindingSubspanOp>(target, plan);
assert(targetInterfaceOp);
if (auto valueConstantOp = getEquivalentOpOfType<ConstantOp>(value, plan)) {
return false;
}
if (auto valueInterfaceOp =
getEquivalentOpOfType<IREE::HAL::InterfaceBindingSubspanOp>(value,
plan)) {
if (targetInterfaceOp.binding() != valueInterfaceOp.binding() ||
targetInterfaceOp.byte_offset() != valueInterfaceOp.byte_offset()) {
// If the binding and offsets are different, map these to different
// memrefs.
return false;
}
// If the binding and offsets are the same, make sure that the
// !flow.dispatch.tensor is read-write.
auto sourceType =
valueInterfaceOp.getType().dyn_cast<IREE::Flow::DispatchTensorType>();
return sourceType &&
sourceType.getAccess() == IREE::Flow::TensorAccess::ReadWrite;
}
return true;
}
/// Tries to add the `value` and `target` to the same equivalence class.
static LogicalResult analyseInterfaceStoreTensorOp(
IREE::Flow::DispatchTensorStoreOp storeOp, BufferizationPlan &plan) {
// The value and target can be unioned if the set that the value is part of
// does not contain any `hal.interface.binding.subspan` op from a different
// binding.
Value value = storeOp.value();
Value target = storeOp.target();
if (!getEquivalentOpOfType<IREE::HAL::InterfaceBindingSubspanOp>(target,
plan)) {
return storeOp.emitError(
"expected target of store op to already be added to an equivalence "
"set");
}
if (canSetStoreValueAndTargetAsEquivalent(storeOp, plan)) {
plan.unionSets(value, target);
} else {
plan.insert(value);
}
plan.storeSet(target);
return success();
}
static LogicalResult analyseInterfaceBindingSubspanOp(
IREE::HAL::InterfaceBindingSubspanOp subspanOp, BufferizationPlan &plan) {
plan.insert(subspanOp.getResult());
return success();
}
static LogicalResult analysePadTensorOp(linalg::PadTensorOp padTensorOp,
BufferizationPlan &plan) {
plan.insert(padTensorOp.source());
plan.insert(padTensorOp.result());
return success();
}
/// For every result of the LinalgOp, gets the operand (from `ins` or `outs`)
/// whose buffer can be reused for the result.
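/// For example (illustrative IR), in
///
/// ```mlir
/// %fill = linalg.fill(%cst, %init) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
/// %gemm = linalg.matmul ins(%lhs, %rhs : tensor<?x?xf32>, tensor<?x?xf32>)
/// outs(%fill : tensor<?x?xf32>) -> tensor<?x?xf32>
/// ```
///
/// the `outs` operand `%fill` is read by the matmul, has a single use, and is
/// not derived from a read-only buffer, so its buffer can be tied to `%gemm`.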
static SmallVector<Value> getTiedOperandsForLinalgOps(
linalg::LinalgOp linalgOp, const BufferizationPlan &plan) {
SmallVector<Value> tiedOperands(linalgOp.getOperation()->getNumResults());
auto outputOperands = linalgOp.getOutputOperands();
for (auto outTensor : llvm::enumerate(outputOperands)) {
if (linalgOp.payloadUsesValueFromOperand(outTensor.value())) {
// If the `outs` tensor has a single use (this op) and is not from a
// read-only buffer, the `outs` tensor can be tied to the result.
if (outTensor.value()->get().hasOneUse() &&
!isFromReadOnlyTensor(outTensor.value()->get(), plan)) {
tiedOperands[outTensor.index()] = outTensor.value()->get();
}
}
}
for (auto result : llvm::enumerate(outputOperands)) {
// If the output tensor is not actually used by this op (it only provides the
// shape for initialization), the result tensor's buffer can be reused for one
// of the input operands.
// TODO(#5040): A better way to handle this case is to allocate a buffer and
// then vectorization + load-store forwarding to remove the intermediate
// buffer. This requires vectorization to handle all cases downstream. This
// is a WAR for current use cases.
if (linalgOp.payloadUsesValueFromOperand(result.value())) {
continue;
}
for (auto input : linalgOp.getInputTensorOperands()) {
auto producerOp = input->get().getDefiningOp<linalg::LinalgOp>();
if (producerOp && input->get().hasOneUse() &&
input->get().getType() == result.value()->get().getType() &&
linalgOp.getTiedIndexingMap(input) ==
linalgOp.getTiedIndexingMap(result.value())) {
assert(!tiedOperands[result.index()]);
tiedOperands[result.index()] = input->get();
break;
}
}
}
return tiedOperands;
}
static LogicalResult analyseLinalgExtOps(linalg_ext::LinalgExtOp op,
BufferizationPlan &plan) {
if (!op.hasTensorSemantics()) return success();
// TODO(hanchung): Revisit if we can tie together op.getOutputOperands() with
// the corresponding op.getInputOperands(). For now we have a limited set of
// LinalgExt ops and there is no use case, so we ignore it.
// Note: this is what should be done for LinalgOps, except for what is done
// for operand fusion today.
for (auto input : op.getInputOperands()) {
plan.insert(input->get());
}
for (auto output : op.getOutputOperands()) {
plan.insert(output->get());
}
for (auto result : op->getResults()) {
plan.insert(result);
}
return success();
}
/// Adds the corresponding `outs` and result tensors of the linalg op into the
/// same equivalence class.
static LogicalResult analyseLinalgOps(linalg::LinalgOp linalgOp,
BufferizationPlan &plan) {
if (!linalgOp.hasTensorSemantics()) return success();
auto results = linalgOp->getResults();
auto tiedOperands = getTiedOperandsForLinalgOps(linalgOp, plan);
for (auto it : llvm::enumerate(llvm::zip(results, tiedOperands))) {
Value resultTensor = std::get<0>(it.value());
Value tiedOperand = std::get<1>(it.value());
if (tiedOperand) {
plan.unionSets(resultTensor, tiedOperand);
}
plan.insert(linalgOp.getOutputOperand(it.index())->get());
plan.insert(resultTensor);
}
return success();
}
/// Returns true if there is a single use of the `value` that is "real",
/// i.e. where the value itself is used, and not the type of the value. For
/// example, a use in a `memref.dim` is only looking at the type and not the
/// value.
static bool hasSingleRealUse(Value value) {
int numUsers = 0;
for (OpOperand &use : value.getUses()) {
if (!isa<memref::DimOp, tensor::DimOp>(use.getOwner())) {
numUsers++;
}
}
return numUsers == 1;
}
/// For operations that have a single operand and result, adds both to the same
/// equivalence class.
static LogicalResult analyseSingleOperandResultOp(Value source, Value result,
BufferizationPlan &plan) {
if (hasSingleRealUse(source) || isFromReadOnlyTensor(source, plan)) {
plan.unionSets(source, result);
return success();
}
plan.insert(source);
plan.insert(result);
return success();
}
static LogicalResult analyseSubTensorOp(tensor::ExtractSliceOp subTensorOp,
BufferizationPlan &plan) {
if (!canUsersHandleSubviews(subTensorOp)) {
plan.insert(subTensorOp.source());
plan.insert(subTensorOp.result());
return success();
}
return analyseSingleOperandResultOp(subTensorOp.source(),
subTensorOp.result(), plan);
}
/// Adds the `dest` and `result` tensor of a subtensor insert operation into the
/// same equivalence class. If `source` is not null also checks that the
/// `source` and `dest` are not equivalent.
static LogicalResult analyseDestructiveUpdateOp(Operation *op, Value source,
Value dest, Value result,
BufferizationPlan &plan) {
if (hasSingleRealUse(dest) && !isFromReadOnlyTensor(dest, plan)) {
plan.unionSets(dest, result);
} else if (source && plan.isEquivalent(source, dest)) {
// The destructive update pattern can put the source and dest in the same
// equivalence class, but that is checked explicitly later on. So at this
// stage this shouldn't happen.
return op->emitError(
"unexpected source and dest being equivalent in destructive update op");
}
plan.insert(dest);
plan.insert(result);
return success();
}
static LogicalResult analyseScfIfOp(scf::IfOp ifOp, BufferizationPlan &plan) {
if (!ifOp.getNumResults()) return success();
for (auto it : llvm::zip(ifOp.getResults(), ifOp.thenYield().getOperands(),
ifOp.elseYield().getOperands())) {
Value result = std::get<0>(it);
if (!result.getType().isa<RankedTensorType>()) continue;
// All results and yields of the if-then-else are tied together.
plan.unionSets(result, std::get<1>(it));
plan.unionSets(result, std::get<2>(it));
}
return success();
}
static LogicalResult analyseScfForOp(scf::ForOp forOp,
BufferizationPlan &plan) {
if (forOp.results().empty()) return success();
if (!llvm::all_of(forOp->getResultTypes(), [](Type resultType) {
return resultType.isa<RankedTensorType>();
})) {
return success();
}
auto yieldOp = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
auto regionArgs = forOp.getRegionIterArgs();
auto initArgs = forOp.initArgs();
for (int i = 0; i < yieldOp.results().size(); ++i) {
Value yieldTensor = yieldOp.results()[i];
Value resultTensor = forOp.results()[i];
Value initArg = initArgs[i];
Value arg = regionArgs[i];
// Always tie the yield, the result tensor, and the region arg
plan.unionSets(yieldTensor, resultTensor);
plan.unionSets(yieldTensor, arg);
// If the init value is not read-only and has a single use, then tie the init
// and result (and by extension all 4 tensors here).
if (hasSingleRealUse(initArg) && !isFromReadOnlyTensor(initArg, plan)) {
plan.unionSets(initArg, resultTensor);
}
}
return success();
}
/// Look for destructive update loop pattern.
///
/// ```mlir
/// %result = scf.for %arg0 = ... iter_args(%arg1 = %init) {
/// %st = subtensor %arg1[...]
///
/// %yieldVal = tensor.insert_slice %val, %arg1[...]
/// scf.yield %yieldVal
/// }
///
/// `%result`, `%arg1` and `%yieldVal` are all already in the same equivalence
/// class. `%st` and `%arg1` can be added to the same equivalence class even
/// though `%arg1` has multiple uses. The same is true for `%yieldVal` and
/// `%arg1`. Here we also verify there are no other "value" uses of
/// `%arg1`. This might be overly constraining, but we can relax gradually.
static LogicalResult hasDestructiveUpdateLoopPattern(scf::ForOp forOp,
BufferizationPlan &plan) {
for (BlockArgument arg : forOp.getRegionIterArgs()) {
auto isDestructiveUpdateUses = [&](OpOperand &use) -> bool {
Operation *user = use.getOwner();
return TypeSwitch<Operation *, bool>(user)
.Case<tensor::ExtractSliceOp>([&](tensor::ExtractSliceOp sliceOp) {
return sliceOp.source() == arg;
})
.Case<tensor::InsertSliceOp>(
[&](tensor::InsertSliceOp subTensorInsertOp) {
return subTensorInsertOp.dest() == arg;
})
.Case<memref::DimOp, scf::YieldOp, tensor::DimOp>(
[&](auto op) { return true; })
.Default([&](Operation *op) { return false; });
};
if (llvm::all_of(arg.getUses(), isDestructiveUpdateUses)) {
for (Operation *user : arg.getUsers()) {
TypeSwitch<Operation *>(user)
.Case<tensor::ExtractSliceOp>([&](tensor::ExtractSliceOp sliceOp) {
plan.unionSets(sliceOp.source(), sliceOp.result());
})
.Case<tensor::InsertSliceOp>(
[&](tensor::InsertSliceOp subTensorInsertOp) {
if (!isFromReadOnlyTensor(subTensorInsertOp.source(), plan)) {
plan.unionSets(subTensorInsertOp.source(),
subTensorInsertOp.dest());
}
})
.Default([&](Operation *) {});
}
}
}
return success();
}
static LogicalResult analyseOperations(FuncOp funcOp, BufferizationPlan &plan) {
auto bufferMappingFn = [&](Operation *op) -> WalkResult {
return TypeSwitch<Operation *, LogicalResult>(op)
.Case<ConstantOp>([&](ConstantOp constantOp) {
return analyseConstantOp(constantOp, plan);
})
.Case<IREE::Flow::DispatchTensorLoadOp>(
[&](IREE::Flow::DispatchTensorLoadOp loadOp) {
return analyseInterfaceLoadTensorOp(loadOp, plan);
})
.Case<IREE::Flow::DispatchTensorStoreOp>(
[&](IREE::Flow::DispatchTensorStoreOp storeOp) {
return analyseInterfaceStoreTensorOp(storeOp, plan);
})
.Case<IREE::Flow::DispatchTieShapeOp>(
[&](IREE::Flow::DispatchTieShapeOp tieShapeOp) {
return analyseSingleOperandResultOp(tieShapeOp.operand(),
tieShapeOp.result(), plan);
})
.Case<IREE::HAL::InterfaceBindingSubspanOp>(
[&](IREE::HAL::InterfaceBindingSubspanOp subspanOp) {
return analyseInterfaceBindingSubspanOp(subspanOp, plan);
})
.Case<linalg::PadTensorOp>([&](linalg::PadTensorOp padTensorOp) {
return analysePadTensorOp(padTensorOp, plan);
})
.Case<linalg::LinalgOp>([&](linalg::LinalgOp linalgOp) {
return analyseLinalgOps(linalgOp, plan);
})
.Case<linalg_ext::LinalgExtOp>(
[&](linalg_ext::LinalgExtOp linalgExtOp) {
return analyseLinalgExtOps(linalgExtOp, plan);
})
.Case<linalg::TensorCollapseShapeOp, linalg::TensorExpandShapeOp>(
[&](auto reshapeOp) {
return analyseSingleOperandResultOp(reshapeOp.src(),
reshapeOp.result(), plan);
})
.Case<tensor::ExtractSliceOp>([&](tensor::ExtractSliceOp sliceOp) {
return analyseSubTensorOp(sliceOp, plan);
})
.Case<tensor::InsertSliceOp>(
[&](tensor::InsertSliceOp subTensorInsertOp) {
return analyseDestructiveUpdateOp(
subTensorInsertOp, subTensorInsertOp.source(),
subTensorInsertOp.dest(), subTensorInsertOp.result(), plan);
})
.Case<tensor::CastOp>([&](tensor::CastOp castOp) {
return analyseSingleOperandResultOp(castOp.source(), castOp.dest(),
plan);
})
.Case<tensor::InsertOp>([&](tensor::InsertOp insertOp) {
return analyseDestructiveUpdateOp(insertOp, /*source =*/nullptr,
insertOp.dest(), insertOp.result(),
plan);
})
.Case<vector::TransferReadOp>(
[&](vector::TransferReadOp transferReadOp) {
plan.insert(transferReadOp.source());
return success();
})
.Case<vector::TransferWriteOp>(
[&](vector::TransferWriteOp transferWriteOp) {
return analyseDestructiveUpdateOp(transferWriteOp, nullptr,
transferWriteOp.source(),
transferWriteOp.result(), plan);
})
.Case<scf::IfOp>(
[&](scf::IfOp ifOp) { return analyseScfIfOp(ifOp, plan); })
.Case<scf::ForOp>(
[&](scf::ForOp forOp) { return analyseScfForOp(forOp, plan); })
.Default([&](Operation *op) { return success(); });
};
if (funcOp.walk<WalkOrder::PreOrder>(bufferMappingFn).wasInterrupted()) {
return failure();
}
DEBUG_WITH_TYPE(DEBUG_TYPE, {
llvm::dbgs() << "After First walk ";
plan.dump();
});
if (funcOp
.walk([&](scf::ForOp forOp) -> WalkResult {
return hasDestructiveUpdateLoopPattern(forOp, plan);
})
.wasInterrupted()) {
return failure();
}
DEBUG_WITH_TYPE(DEBUG_TYPE, {
llvm::dbgs() << "After Destructive update walk ";
plan.dump();
});
return success();
}
//===----------------------------------------------------------------------===//
// Bufferization helper functions using BlockAndValueMapping.
//===----------------------------------------------------------------------===//
/// Returns the dynamic dimensions of a Value `v` that is assumed to be
/// ShapedType.
static SmallVector<Value, 4> getDynamicDims(OpBuilder &b, Location loc,
Value v) {
SmallVector<Value, 4> dynamicDims;
for (auto shape : enumerate(v.getType().cast<ShapedType>().getShape())) {
if (shape.value() == ShapedType::kDynamicSize) {
dynamicDims.push_back(
b.createOrFold<memref::DimOp>(loc, v, shape.index()));
}
}
return dynamicDims;
}
/// Allocates a memref for the results of an operation. Uses the
/// `ReifyRankedShapedTypeOpInterface` where possible to get the shape of the
/// output in terms of the shapes of the operands.
static Value allocateBufferForResult(OpBuilder &b, Operation *op,
unsigned resultNum,
WorkgroupMemoryAllocationFn allocationFn) {
assert(op->getNumResults() > resultNum);
RankedTensorType resultType =
op->getResult(resultNum).getType().cast<RankedTensorType>();
SmallVector<Value, 4> dynamicDims;
// Get the shape of the result
Location loc = op->getLoc();
if (auto shapedOp = dyn_cast<ReifyRankedShapedTypeOpInterface>(op)) {
ReifiedRankedShapedTypeDims resultShape;
if (failed(shapedOp.reifyResultShapes(b, resultShape))) {
return nullptr;
}
for (auto shape : enumerate(resultShape[resultNum])) {
if (resultType.isDynamicDim(shape.index())) {
dynamicDims.push_back(shape.value());
}
}
} else if (auto loadOp = dyn_cast<IREE::Flow::DispatchTensorLoadOp>(op)) {
dynamicDims = llvm::to_vector<4>(loadOp.sizes());
} else if (auto sliceOp = dyn_cast<tensor::ExtractSliceOp>(op)) {
dynamicDims = llvm::to_vector<4>(sliceOp.sizes());
} else if (auto subTensorInsertOp = dyn_cast<tensor::InsertSliceOp>(op)) {
dynamicDims = getDynamicDims(b, loc, subTensorInsertOp.dest());
} else if (auto transferWriteOp = dyn_cast<vector::TransferWriteOp>(op)) {
dynamicDims = getDynamicDims(b, loc, transferWriteOp.source());
} else {
dynamicDims = getDynamicDims(b, loc, op->getResult(resultNum));
}
// If it is a static allocation, hoist it all the way up to the beginning of
// the function.
if (dynamicDims.empty()) {
auto funcOp = op->getParentOfType<FuncOp>();
OpBuilder::InsertionGuard g(b);
b.setInsertionPointToStart(&funcOp.front());
return allocationFn(b, loc, resultType.getShape(),
resultType.getElementType(), dynamicDims);
}
return allocationFn(b, loc, resultType.getShape(),
resultType.getElementType(), dynamicDims);
}
template <typename TensorType>
static MemRefType getMemrefTypeForTensor(TensorType tensorType,
ArrayRef<AffineMap> layout = {},
unsigned memorySpace = 0) {
return MemRefType::get(tensorType.getShape(), tensorType.getElementType(),
layout, memorySpace);
}
/// Creates a subview operation given the `src`, `offsets`, `sizes` and
/// `strides`. Handles the corner case where the `offsets`, `sizes` and
/// `strides` are empty, in which case the `src` value is just forwarded.
/// TODO(ataei): Instead create memref.subview %v [][][] folder.
static Value createSubviewOp(OpBuilder &b, Location loc, Value src,
ArrayRef<OpFoldResult> offsets,
ArrayRef<OpFoldResult> sizes,
ArrayRef<OpFoldResult> strides,
MemRefType resultType = MemRefType()) {
if (offsets.empty() && sizes.empty() && strides.empty()) return src;
return b.create<memref::SubViewOp>(loc, resultType, src, offsets, sizes,
strides);
}
//===----------------------------------------------------------------------===//
// There might be cases where the `value` stored by a
// `flow.dispatch.tensor.store` operation is obtained from an operation that
// computes the value (say a `linalg` operation) through a series of `reshape`,
// `cast`, etc. operations. When trying to reuse the buffer for the result
// passed in to the dispatch region for these operations, these operations need
// to be "replayed" in reverse so that the type of the buffer in the operation
// computing the value matches what is expected.
//
// For example,
// ```mlir
// %buffer = hal.interface.binding.subspan .. : tensor<?xf32>
// %result = linalg.matmul ins(%lhs, %rhs : tensor<?x?xf32>, tensor<?x?xf32>)
// outs(%init : tensor<?x?xf32>) -> tensor<?x?xf32>
// %value = linalg.tensor_reshape %result [affine_map<(d0, d1) -> (d0, d1)>]
// : tensor<?x?xf32> into tensor<?xf32>
// flow.dispatch.tensor.store %value, %buffer[..] [..] [..]
// ```
//
// needs to be converted to
//
// ```mlir
// %buffer = hal.interface.binding.subspan .. : memref<?xf32>
// %result = subview %buffer[..] [..] [..] : memref<?xf32>
// %value = linalg.reshape %result [affine_map<(d0, d1) -> (d0, d1)>]
// : memref<?xf32> into memref<?x?xf32>
// linalg.matmul ins(%lhs, %rhs : memref<?x?xf32>, memref<?x?xf32>)
// outs(%result : memref<?x?xf32>)
// flow.dispatch.tensor.store %value, %buffer[..] [..] [..]
// ```
//
// ===----------------------------------------------------------------------===//
/// For a given store-like `op` that is to be replaced, finds the earliest
/// possible insertion point in the same block such that
/// - the replacement op uses values in `usedValues`, so it has to be inserted
/// after the ops that define these, and
/// - the op needs to be inserted before `insertBefore` (which is in the same
/// block). Returns nullptr in all other cases.
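/// For example (hypothetical block), with `usedValues = {%a, %b}` and
/// `insertBefore` being the op defining `%d` in
///
/// ```mlir
/// %a = ...
/// %b = ...
/// %c = ...
/// %d = ...
/// ```
///
/// the returned insertion point is the op defining `%c`, i.e. the op right
/// after the last definition used (`%b`) that is still before `%d`.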
static Operation *getInsertionPointForReplacementStoreOp(
Operation *op, Operation *insertBefore, ArrayRef<Value> usedValues) {
if (op->getBlock() != insertBefore->getBlock()) return nullptr;
Operation *insertAfter = nullptr;
for (auto value : usedValues) {
Operation *definingOp = value.getDefiningOp();
if (!definingOp || definingOp->getBlock() != insertBefore->getBlock())
continue;
if (!insertAfter || insertAfter->isBeforeInBlock(definingOp))
insertAfter = definingOp;
}
// All defining ops are outside of the block, so just insert at the start of
// the block.
if (!insertAfter) return &(op->getBlock()->front());
if (insertAfter->isBeforeInBlock(insertBefore))
return insertAfter->getNextNode();
return nullptr;
}
/// Returns the subview into the buffer that is supposed to be populated with
/// the `value` of the `flow.dispatch.tensor.store` operation. This can be used
/// to compute the results in place.
static Value getSubviewOpForTensorStoreOp(OpBuilder &b, Operation *storeOp,
const BlockAndValueMapping &bvm) {
SmallVector<Value, 4> operandsOfSubviewOp;
auto op = cast<OffsetSizeAndStrideOpInterface>(storeOp);
Value target = TypeSwitch<Operation *, Value>(op)
.Case<IREE::Flow::DispatchTensorStoreOp>(
[&](auto storeOp) { return storeOp.target(); })
.Case<tensor::InsertSliceOp>(
[&](auto storeOp) { return storeOp.dest(); })
.Default([](Operation *) { return nullptr; });
if (!target) return nullptr;
operandsOfSubviewOp.push_back(bvm.lookup(target));
operandsOfSubviewOp.append(op.offsets().begin(), op.offsets().end());
operandsOfSubviewOp.append(op.sizes().begin(), op.sizes().end());
operandsOfSubviewOp.append(op.strides().begin(), op.strides().end());
Operation *insertBefore = &(*b.getInsertionPoint());
Operation *insertionPoint = getInsertionPointForReplacementStoreOp(
op.getOperation(), insertBefore, operandsOfSubviewOp);
if (!insertionPoint) return nullptr;
OpBuilder::InsertionGuard g(b);
Value subview =
createSubviewOp(b, op.getLoc(), bvm.lookup(target), op.getMixedOffsets(),
op.getMixedSizes(), op.getMixedStrides());
return subview;
}
/// Gets the reverse of a
/// `linalg.tensor_expand_shape`/`linalg.tensor_collapse_shape` op to get a
/// memref type that can be used for in-place computation of the result of a
/// dispatch region.
template <typename TensorReshapeOpTy>
static Value getReverseOfReshapeOp(OpBuilder &b, TensorReshapeOpTy reshapeOp,
Value resultBuffer) {
auto memrefType = getMemrefTypeForTensor(
reshapeOp.getSrcType(), {},
resultBuffer.getType().cast<MemRefType>().getMemorySpaceAsInt());
using ReverseReshapeOpTy = typename std::conditional<
std::is_same<TensorReshapeOpTy, linalg::TensorCollapseShapeOp>::value,
memref::ExpandShapeOp, memref::CollapseShapeOp>::type;
return b.create<ReverseReshapeOpTy>(reshapeOp.getLoc(), memrefType,
resultBuffer, reshapeOp.reassociation());
}
/// Gets the reverse of a `tensor.cast` op to get a memref type that
/// can be used for in-place computation of the result of a dispatch region.
static Value getReverseOfCastOp(OpBuilder &b, tensor::CastOp castOp,
Value resultBuffer) {
auto memrefType = getMemrefTypeForTensor(
castOp.source().getType().cast<RankedTensorType>(),
resultBuffer.getType().cast<MemRefType>().getAffineMaps(),
resultBuffer.getType().cast<MemRefType>().getMemorySpaceAsInt());
return b.create<memref::CastOp>(castOp.getLoc(), memrefType, resultBuffer);
}
/// Returns a tied result value given the operand. If no such result exists,
/// returns `nullptr`.
static Value getTiedResultForOperand(OpOperand &operand,
const BufferizationPlan &plan) {
for (Value result : operand.getOwner()->getResults()) {
if (plan.isEquivalent(operand.get(), result)) {
return result;
}
}
return nullptr;
}
/// To perform updates directly into the result buffer, the uses need to be
/// walked to get to a value already mapped to a buffer or to a
/// `flow.dispatch.tensor.store` operation. For each use, get the tied result
/// and follow its uses. The traversed uses and their tied results are returned
/// in `traversedUses`.
static Value walkUseToGetResultBuffer(
OpBuilder &b, Value value, const BufferizationPlan &plan,
const BlockAndValueMapping &bvm,
SmallVectorImpl<std::pair<OpOperand *, Value>> &traversedUses) {
Operation *user = nullptr;
while (value.hasOneUse()) {
OpOperand &use = *value.use_begin();
user = use.getOwner();
if (isa<IREE::Flow::DispatchTensorStoreOp>(user)) {
return getSubviewOpForTensorStoreOp(b, user, bvm);
}
value = getTiedResultForOperand(use, plan);
if (!value) return nullptr;
traversedUses.push_back(std::make_pair(&use, value));
if (auto resultBuffer = bvm.lookupOrNull(value)) return resultBuffer;
}
return nullptr;
}
/// For an operation whose `resultValue` is the result of the dispatch region,
/// gets the buffer to use to compute the value in-place.
static Value getInplaceResultBuffer(OpBuilder &b, OpResult resultValue,
const BufferizationPlan &plan,
BlockAndValueMapping &bvm) {
// Traverse the use-def chains to get the `flow.dispatch.tensor.store`
// operation keeping track of all the traversed operations. Note that the
// equivalence set construction should ensure that all operations traversed
// here have a single use.
SmallVector<std::pair<OpOperand *, Value>> traversedUses;
Value resultBuffer =
walkUseToGetResultBuffer(b, resultValue, plan, bvm, traversedUses);
if (!resultBuffer) return nullptr;
DEBUG_WITH_TYPE(DEBUG_TYPE, {
llvm::dbgs() << "Pair :\n\tTensor :";
resultValue.getOwner()->print(llvm::dbgs());
llvm::dbgs() << "\n\t\tMemref :";
resultBuffer.print(llvm::dbgs());
llvm::dbgs() << "\n";
});
// Now replay the instructions that are essentially doing type-conversion, in
// reverse, to get the type needed for the operation computing the value.
for (auto &it : llvm::reverse(traversedUses)) {
Operation *op = it.first->getOwner();
resultBuffer =
TypeSwitch<Operation *, Value>(op)
.Case<scf::IfOp, scf::ForOp, linalg::LinalgOp,
linalg_ext::LinalgExtOp, tensor::InsertSliceOp,
vector::TransferWriteOp>(
[&](auto op) { return resultBuffer; })
.Case<linalg::TensorCollapseShapeOp, linalg::TensorExpandShapeOp>(
[&](auto reshapeOp) {
return getReverseOfReshapeOp(b, reshapeOp, resultBuffer);
})
.Case<tensor::CastOp>([&](tensor::CastOp castOp) {
return getReverseOfCastOp(b, castOp, resultBuffer);
})
.Default([&](Operation *) { return nullptr; });
// TODO(ravishankarm): Maybe this needs to be an error. This should not have
// happened.
if (!resultBuffer) return nullptr;
Value tiedResult = it.second;
bvm.map(tiedResult, resultBuffer);
DEBUG_WITH_TYPE(DEBUG_TYPE, {
llvm::dbgs() << "Pair :\n\tTensor result "
<< tiedResult.cast<OpResult>().getResultNumber() << " :";
op->print(llvm::dbgs());
llvm::dbgs() << "\n\t\tMemref :";
resultBuffer.print(llvm::dbgs());
llvm::dbgs() << "\n";
});
}
return resultBuffer;
}
/// Converts a `tensor.cast` operation into a `memref.cast` operation with the
/// result aliasing the buffer for the operand.
static Value getAliasingBufferForResult(OpBuilder &b, tensor::CastOp castOp,
BlockAndValueMapping &bvm) {
Value inputBuffer = bvm.lookup(castOp.source());
Value resultTensor = castOp.dest();
auto outputType = getMemrefTypeForTensor(
resultTensor.getType().cast<RankedTensorType>(), {},
inputBuffer.getType().cast<MemRefType>().getMemorySpaceAsInt());
return b.create<memref::CastOp>(castOp.getLoc(), outputType, inputBuffer);
}
/// Returns the subview that indexes into the source of the interface buffer.
static Value getAliasingBufferForResult(OpBuilder &b,
IREE::Flow::DispatchTensorLoadOp loadOp,
BlockAndValueMapping &bvm) {
Location loc = loadOp.getLoc();
Value memref = bvm.lookup(loadOp.source());
return createSubviewOp(b, loc, memref, loadOp.getMixedOffsets(),
loadOp.getMixedSizes(), loadOp.getMixedStrides());
}
/// Converts a `linalg.tensor_collapse/expand_shape` operation to a
/// `memref.collapse/expand_shape` operation with the result aliasing the
/// buffer for the operand.
template <typename TensorReshapeOpTy>
static Value getAliasingBufferForReshapeResult(OpBuilder &b,
TensorReshapeOpTy op,
BlockAndValueMapping &bvm) {
Location loc = op.getLoc();
Value srcTensor = op.src();
RankedTensorType resultTensorType = op.getResultType();
Value inputBuffer = bvm.lookup(srcTensor);
// Create the reshape op.
MemRefType inputBufferType = inputBuffer.getType().cast<MemRefType>();
auto reshapeResultType = getMemrefTypeForTensor(
resultTensorType, {}, inputBufferType.getMemorySpaceAsInt());
using ReshapeOpTy = typename std::conditional<
std::is_same<TensorReshapeOpTy, linalg::TensorCollapseShapeOp>::value,
memref::CollapseShapeOp, memref::ExpandShapeOp>::type;
Value bufferReshape = b.create<ReshapeOpTy>(loc, reshapeResultType,
inputBuffer, op.reassociation());
return bufferReshape;
}
/// Converts a `tensor.extract_slice` operation to a `memref.subview` operation.
static Value getAliasingBufferForResult(OpBuilder &b, tensor::ExtractSliceOp op,
BlockAndValueMapping &bvm) {
Location loc = op.getLoc();
Value srcTensor = op.source();
Value inputBuffer = bvm.lookup(srcTensor);
ShapedType sourceType = op.getSourceType();
ShapedType resultType = op.getType();
SmallVector<OpFoldResult> offsets = op.getMixedOffsets();
SmallVector<OpFoldResult> sizes = op.getMixedSizes();
SmallVector<OpFoldResult> strides = op.getMixedStrides();
MemRefType subViewResultType =
(resultType.getRank() < sourceType.getRank()
? memref::SubViewOp::inferRankReducedResultType(
resultType.getRank(), inputBuffer.getType().cast<MemRefType>(),
offsets, sizes, strides)
.cast<MemRefType>()
: MemRefType());
return b.create<memref::SubViewOp>(loc, subViewResultType, inputBuffer,
offsets, sizes, strides);
}
/// Returns output buffers that alias the inputs.
static SmallVector<Value> getAliasingBuffersForResult(
scf::ForOp scfFor, BlockAndValueMapping &bvm) {
SmallVector<Value> aliasedBuffers(scfFor.results().size(), nullptr);
for (int i = 0; i < scfFor.results().size(); ++i) {
Value inputTensor = scfFor.initArgs()[i];
if (!inputTensor.getType().isa<RankedTensorType>()) continue;
Value inputBuffer = bvm.lookup(inputTensor);
aliasedBuffers[i] = inputBuffer;
}
return aliasedBuffers;
}
/// Returns a `memref` for every result that aliases the buffer for one of its
/// operands. Returns the memref of the right shape/type based on the operation.
static SmallVector<Value, 4> getAliasingBuffersForResults(
OpBuilder &b, Operation *op, BlockAndValueMapping &bvm) {
return TypeSwitch<Operation *, SmallVector<Value, 4>>(op)
.Case<IREE::Flow::DispatchTensorLoadOp, tensor::ExtractSliceOp,
tensor::CastOp>([&](auto singleResultOp) -> SmallVector<Value, 4> {
return {getAliasingBufferForResult(b, singleResultOp, bvm)};
})
.Case<linalg::TensorCollapseShapeOp, linalg::TensorExpandShapeOp>(
[&](auto reshapeOp) -> SmallVector<Value, 4> {
return {getAliasingBufferForReshapeResult(b, reshapeOp, bvm)};
})
.Case<scf::ForOp>([&](auto scfFor) -> SmallVector<Value> {
return getAliasingBuffersForResult(scfFor, bvm);
})
.Default([&](Operation *op) -> SmallVector<Value, 4> {
return SmallVector<Value, 4>(op->getNumResults(), nullptr);
});
}
/// For a result value, gets the operand that is tied with it. If no such
/// operand exists, returns `nullptr`.
static Value getTiedOperandForResult(OpResult result,
const BufferizationPlan &plan) {
for (Value operand : result.getOwner()->getOperands()) {
if (plan.isEquivalent(operand, result)) {
return operand;
}
}
return nullptr;
}
static bool hasTiedOperandForResult(OpResult result,
const BufferizationPlan &plan) {
return static_cast<bool>(getTiedOperandForResult(result, plan));
}
/// Computes the `memrefs` to use for the result of an operation based on
/// - If the result has a tied operand, reuse the buffer for the tied operand
/// (or an alias of it) as the buffer for the result. The `aliasingBuffers`
/// vector is expected to be as large as the number of results.
/// - If the result has no tied operands, the corresponding position in the
/// `aliasingBuffers` list must be `nullptr`.
/// - If the result is in the same equivalence set as the result of the dispatch
/// region (i.e. `value` operand of a `flow.dispatch.tensor.store`) then
/// return an alias/view of the buffer passed into the dispatch region to
/// store the results.
/// - Lastly, allocate a temporary buffer for the result using the passed
/// allocation function.
static LogicalResult getOrAllocateResultBuffers(
OpBuilder &b, Operation *op, ArrayRef<Value> aliasingBuffers,
BlockAndValueMapping &bvm, BufferizationPlan &plan,
WorkgroupMemoryAllocationFn allocationFn) {
assert(aliasingBuffers.size() == op->getNumResults());
auto results = op->getResults();
for (auto result : llvm::enumerate(results)) {
if (!result.value().getType().isa<RankedTensorType>() ||
bvm.contains(result.value())) {
continue;
}
Value buffer;
if (aliasingBuffers[result.index()] &&
hasTiedOperandForResult(result.value(), plan)) {
buffer = aliasingBuffers[result.index()];
}
if (!buffer && plan.isInStoreSet(result.value())) {
buffer = getInplaceResultBuffer(b, result.value(), plan, bvm);
}
if (!buffer) {
buffer = allocateBufferForResult(b, op, result.index(), allocationFn);
}
if (!buffer) {
return op->emitError("unable to get result buffer for op");
}
bvm.map(result.value(), buffer);
DEBUG_WITH_TYPE(DEBUG_TYPE, {
llvm::dbgs() << "Pair :\n\tTensor result " << result.index() << ":";
op->print(llvm::dbgs());
llvm::dbgs() << "\n\t\tMemref :";
buffer.print(llvm::dbgs());
llvm::dbgs() << "\n";
});
}
return success();
}
/// Convenience wrapper around core allocation function for the case where the
/// alias is the buffer for the result directly.
static LogicalResult getOrAllocateResultBuffers(
OpBuilder &b, Operation *op, BlockAndValueMapping &bvm,
BufferizationPlan &plan, WorkgroupMemoryAllocationFn allocationFn) {
auto aliasingBuffers = llvm::to_vector<4>(
llvm::map_range(op->getResults(), [&](OpResult result) {
Value tiedOperand = getTiedOperandForResult(result, plan);
return tiedOperand ? bvm.lookupOrNull(tiedOperand) : tiedOperand;
}));
return getOrAllocateResultBuffers(b, op, aliasingBuffers, bvm, plan,
allocationFn);
}
/// Generic conversion pattern that matches any linalg::LinalgOp. This avoids
/// instantiating one pattern for each linalg::LinalgOp. The method expects
/// that all operands and results have already been mapped to memrefs.
template <typename OpTy>
static LogicalResult convertAnyLinalgOp(
OpBuilder &b, OpTy op, BlockAndValueMapping &bvm, BufferizationPlan &plan,
WorkgroupMemoryAllocationFn allocationFn) {
// Skip linalg ops inserted by this pass.
if (op.hasBufferSemantics()) return success();
Location loc = op.getLoc();
SmallVector<Value, 2> newInputBuffers;
newInputBuffers.reserve(op.getNumInputs());
for (OpOperand *opOperand : op.getInputOperands()) {
if (op.isScalar(opOperand)) {
newInputBuffers.push_back(opOperand->get());
continue;
}
// For `linalg.pooling_*` ops, the input might be from a
// `linalg.init_tensor`. In such cases, the `BlockAndValueMapping` won't have
// a mapping for the buffer. Allocate a buffer for these.
Value inputBuffer = bvm.lookupOrNull(opOperand->get());
if (!inputBuffer) {
OpResult definingOpResult = opOperand->get().dyn_cast<OpResult>();
if (!definingOpResult) return failure();
inputBuffer = allocateBufferForResult(b, definingOpResult.getOwner(),
definingOpResult.getResultNumber(),
allocationFn);
}
newInputBuffers.push_back(inputBuffer);
}
SmallVector<Value, 2> newOutputBuffers;
auto results = op.getOperation()->getResults();
auto outputs = op.getOutputOperands();
for (auto it : llvm::zip(results, outputs)) {
Value resultTensor = std::get<0>(it);
Value resultBuffer = bvm.lookup(resultTensor);
OpOperand *outOperand = std::get<1>(it);
Value outTensor = outOperand->get();
Value outBuffer = bvm.lookupOrNull(outTensor);
if (outBuffer && !plan.isEquivalent(outTensor, resultTensor) &&
op.payloadUsesValueFromOperand(outOperand)) {
b.create<linalg::CopyOp>(loc, outBuffer, resultBuffer);
}
newOutputBuffers.push_back(resultBuffer);
}
SmallVector<Value, 8> newOperands(newInputBuffers.begin(),
newInputBuffers.end());
newOperands.append(newOutputBuffers.begin(), newOutputBuffers.end());
op.clone(b, loc, {}, newOperands);
return success();
}
/// Constants that return tensor types can be handled natively by the
/// backends. Here we just provide a cast to memref to bridge the gap from
/// tensors to memrefs.
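/// For example (illustrative IR), for
/// `%cst = constant dense<...> : tensor<2x3xf32>` a
/// `%m = memref.buffer_cast %cst : memref<2x3xf32>` is created and `%m` is
/// recorded as the buffer for `%cst`.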
static LogicalResult convertConstantOp(OpBuilder &b, ConstantOp constantOp,
BlockAndValueMapping &bvm) {
Value result = constantOp.getResult();
assert(!bvm.lookupOrNull(result));
RankedTensorType tensorType = result.getType().dyn_cast<RankedTensorType>();
if (!tensorType) return success();
OpBuilder::InsertionGuard g(b);
b.setInsertionPointAfter(constantOp);
auto memrefType = getMemrefTypeForTensor(tensorType);
Value memref =
b.create<memref::BufferCastOp>(constantOp.getLoc(), memrefType, result);
bvm.map(result, memref);
return success();
}
static LogicalResult convertDispatchTieShapeOp(
OpBuilder &b, IREE::Flow::DispatchTieShapeOp shapeOp,
BlockAndValueMapping &bvm) {
if (Value v = bvm.lookupOrNull(shapeOp.operand())) {
auto tieShapeOp = b.create<Shape::TieShapeOp>(shapeOp.getLoc(), v.getType(),
v, shapeOp.shape());
bvm.map(shapeOp.getResult(), tieShapeOp.getResult());
}
return success();
}
/// Converts a `tensor.extract` operation into a `load`.
static LogicalResult convertTensorExtractOp(OpBuilder &b, tensor::ExtractOp op,
const BlockAndValueMapping &bvm) {
OpBuilder::InsertionGuard g(b);
b.setInsertionPoint(op);
Value inputBuffer = bvm.lookup(op.tensor());
Value load =
b.createOrFold<memref::LoadOp>(op.getLoc(), inputBuffer, op.indices());
// Since the value is a scalar, and `bvm` is only used to track tensor ->
// memref mappings, just replace the uses directly.
op.result().replaceAllUsesWith(load);
return success();
}
/// Converts a `flow.dispatch.tensor.store` operation to memrefs. If the `value`
/// and `target` are in the same equivalence set, then there is nothing to do.
/// If not, create a subview into the result buffer and copy the `value`.
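/// For example (illustrative IR), when the `value` was computed into a
/// temporary buffer, the store becomes roughly
///
/// ```mlir
/// %sv = memref.subview %target_buffer[...] [...] [...]
/// linalg.copy(%value_buffer, %sv)
/// ```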
static LogicalResult convertInterfaceStoreTensorOp(
OpBuilder &b, IREE::Flow::DispatchTensorStoreOp storeOp,
BlockAndValueMapping &bvm, BufferizationPlan &plan) {
if (plan.isEquivalent(storeOp.target(), storeOp.value())) {
return success();
}
OpBuilder::InsertionGuard g(b);
b.setInsertionPoint(storeOp);
Value storeTo = bvm.lookup(storeOp.target());
Value storeFrom = bvm.lookup(storeOp.value());
Value subview =
createSubviewOp(b, storeOp.getLoc(), storeTo, storeOp.getMixedOffsets(),
storeOp.getMixedSizes(), storeOp.getMixedStrides());
b.create<linalg::CopyOp>(storeOp->getLoc(), storeFrom, subview);
return success();
}
/// Converts a `tensor.insert_slice` operation to buffers by
/// - Allocating a buffer for the result (if needed), and copying the
/// destination value into this buffer.
/// - Copying the source values into a subview of the result buffer.
static LogicalResult convertSubTensorInsertOp(OpBuilder &b,
tensor::InsertSliceOp op,
BlockAndValueMapping &bvm,
BufferizationPlan &plan) {
Location loc = op.getLoc();
Value result = op.getResult();
ShapedType resultType = op.getType();
Value resultBuffer = bvm.lookup(result);
// If `dest` and `result` are not equivalent, need a copy for that.
Value dest = op.dest();
if (!plan.isEquivalent(dest, result)) {
Value destBuffer = bvm.lookup(dest);
b.create<linalg::CopyOp>(loc, destBuffer, resultBuffer);
}
Value source = op.source();
if (plan.isEquivalent(source, dest)) {
return success();
}
// Copy from the source to the result subview.
ShapedType sourceType = op.getSourceType();
Value sourceBuffer = bvm.lookup(source);
SmallVector<OpFoldResult> offsets = op.getMixedOffsets();
SmallVector<OpFoldResult> sizes = op.getMixedSizes();
SmallVector<OpFoldResult> strides = op.getMixedStrides();
MemRefType subViewResultType =
(sourceType.getRank() < resultType.getRank()
? memref::SubViewOp::inferRankReducedResultType(
sourceType.getRank(),
resultBuffer.getType().cast<MemRefType>(), offsets, sizes,
strides)
.cast<MemRefType>()
: MemRefType());
Value subViewOp = createSubviewOp(b, loc, resultBuffer, offsets, sizes,
strides, subViewResultType);
b.create<linalg::CopyOp>(loc, sourceBuffer, subViewOp);
return success();
}
/// Converts a `tensor.insert` operation into a `memref.store`.
static LogicalResult convertTensorInsertOp(OpBuilder &b, tensor::InsertOp op,
BlockAndValueMapping &bvm,
BufferizationPlan &plan) {
Location loc = op.getLoc();
Value result = op.result();
Value resultBuffer = bvm.lookup(result);
if (!plan.isEquivalent(op.dest(), result)) {
Value destBuffer = bvm.lookup(op.dest());
b.create<linalg::CopyOp>(loc, destBuffer, resultBuffer);
}
b.create<memref::StoreOp>(loc, op.scalar(), resultBuffer, op.indices());
return success();
}
/// Converts a vector.transfer_write op to use memref operands for the source.
static LogicalResult convertVectorTransferWriteOp(OpBuilder &b,
vector::TransferWriteOp op,
BlockAndValueMapping &bvm,
BufferizationPlan &plan) {
Location loc = op.getLoc();
Value result = op.result();
RankedTensorType resultType = result.getType().dyn_cast<RankedTensorType>();
if (!resultType) return success();
Value resultBuffer = bvm.lookup(result);
if (!plan.isEquivalent(op.source(), result)) {
Value destBuffer = bvm.lookup(op.source());
b.create<linalg::CopyOp>(loc, destBuffer, resultBuffer);
}
// Create a new vector.transfer_write operation without a result value.
b.create<vector::TransferWriteOp>(
loc, op.vector(), resultBuffer, op.indices(), op.permutation_map(),
op.mask(), op.in_bounds() ? *op.in_bounds() : ArrayAttr());
return success();
}
static LogicalResult convertScfForOp(OpBuilder &b, scf::ForOp forOp,
BlockAndValueMapping &bvm,
BufferizationPlan &plan) {
auto yieldOp = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
Location loc = forOp.getLoc();
for (auto arg : llvm::enumerate(forOp.getRegionIterArgs())) {
if (!arg.value().getType().isa<RankedTensorType>()) continue;
Value resultTensor = forOp.getResult(arg.index());
Value resultBuffer = bvm.lookup(resultTensor);
OpOperand &initOperand = forOp.getOpOperandForRegionIterArg(arg.value());
Value yieldOperand = yieldOp.getOperand(arg.index());
bvm.map(arg.value(), resultBuffer);
bvm.map(yieldOperand, resultBuffer);
if (!plan.isEquivalent(arg.value(), initOperand.get())) {
Value initBuffer = bvm.lookup(initOperand.get());
b.create<linalg::CopyOp>(loc, initBuffer, resultBuffer);
}
}
return success();
}
static LogicalResult convertScfIfOp(OpBuilder &b, scf::IfOp ifOp,
BlockAndValueMapping &bvm,
BufferizationPlan &plan) {
auto thenYieldOp = ifOp.thenYield();
auto elseYieldOp = ifOp.elseYield();
for (auto result : llvm::enumerate(ifOp.getResults())) {
Value resultValue = result.value();
if (!resultValue.getType().isa<RankedTensorType>()) continue;
Value resultBuffer = bvm.lookup(resultValue);
Value thenYield = thenYieldOp->getOperand(result.index());
Value elseYield = elseYieldOp->getOperand(result.index());
bvm.map(thenYield, resultBuffer);
bvm.map(elseYield, resultBuffer);
}
return success();
}
/// If the alias of the buffer for an input operand cannot be used for the
/// "tied" results, an explicit copy of the memory pointed to by the aliased
/// buffer into the buffer assigned to the result is needed.
static void copyFromAliasingBufferToResultBuffer(
OpBuilder &b, Location loc, ArrayRef<Value> tiedOperands,
ArrayRef<Value> tiedResults, ArrayRef<Value> aliasingBuffers,
BlockAndValueMapping &bvm, BufferizationPlan &plan) {
for (auto result : enumerate(tiedResults)) {
Value operand = tiedOperands[result.index()];
if (!plan.isEquivalent(result.value(), operand)) {
b.create<linalg::CopyOp>(loc, aliasingBuffers[result.index()],
bvm.lookup(result.value()));
}
}
}
/// Returns the static/dynamic mixed sizes of the memref.
static SmallVector<OpFoldResult> getMemrefSizes(OpBuilder &b, Location loc,
Value memref) {
auto inputShape = memref.getType().cast<ShapedType>().getShape();
SmallVector<OpFoldResult> sizeMixedValues;
for (int64_t i = 0; i < inputShape.size(); ++i) {
if (inputShape[i] == ShapedType::kDynamicSize) {
Value dim = b.create<memref::DimOp>(loc, memref, i);
sizeMixedValues.push_back(dim);
} else {
sizeMixedValues.push_back(b.getI64IntegerAttr(inputShape[i]));
}
}
return sizeMixedValues;
}
static LogicalResult convertPadTensorOp(OpBuilder &b,
linalg::PadTensorOp padTensorOp,
BlockAndValueMapping &bvm) {
auto inputTensor = padTensorOp.source();
auto inputMemref = bvm.lookup(inputTensor);
auto loc = padTensorOp.getLoc();
auto resultPaddedBuffer = bvm.lookup(padTensorOp.result());
// Get padding value and fill the result buffer.
linalg::YieldOp yieldOp =
*padTensorOp.region().getOps<linalg::YieldOp>().begin();
Value paddingValue = yieldOp.values()[0];
auto constOp = paddingValue.getDefiningOp<ConstantOp>();
if (!constOp) {
return padTensorOp.emitError(
"Converting linalg.pad_tensor with non-constant padding value");
}
if (constOp.getValue().isa<DenseElementsAttr>()) {
return padTensorOp.emitError(
"Converting linalg.pad_tensor with non-scalar constant padding "
"value");
}
b.create<linalg::FillOp>(loc, paddingValue, resultPaddedBuffer);
// Get the interior region.
SmallVector<OpFoldResult> sizeMixedValues =
getMemrefSizes(b, loc, inputMemref);
SmallVector<OpFoldResult> strides(
inputMemref.getType().cast<ShapedType>().getRank(),
b.getI64IntegerAttr(1));
auto resultSubView = b.create<memref::SubViewOp>(loc, resultPaddedBuffer,
padTensorOp.getMixedLowPad(),
sizeMixedValues, strides);
// Copy to the interior region.
b.create<linalg::CopyOp>(loc, inputMemref, resultSubView);
return success();
}
namespace {
/// Pass to convert from tensor based ops to memref based ops.
class LinalgBufferizePass : public LinalgBufferizeBase<LinalgBufferizePass> {
public:
LinalgBufferizePass(WorkgroupMemoryAllocationFn fn) : allocationFn(fn) {}
void getDependentDialects(DialectRegistry &registry) const override {
registry
.insert<IREE::Util::UtilDialect, linalg::LinalgDialect,
memref::MemRefDialect, scf::SCFDialect, StandardOpsDialect>();
}
void runOnOperation() override;
private:
WorkgroupMemoryAllocationFn allocationFn;
};
} // namespace
void LinalgBufferizePass::runOnOperation() {
BufferizationPlan plan;
FuncOp funcOp = getOperation();
if (failed(analyseOperations(funcOp, plan))) {
return signalPassFailure();
}
if (funcOp
.walk([&](IREE::Flow::DispatchTensorStoreOp storeOp) -> WalkResult {
return analyseInterfaceStoreTensorOp(storeOp, plan);
})
.wasInterrupted()) {
return signalPassFailure();
}
MLIRContext *context = &getContext();
OpBuilder b(context);
BlockAndValueMapping bvm;
// First go over all hal.interface.binding.subspan ops and create counterparts
// working with memrefs.
funcOp.walk([&](IREE::HAL::InterfaceBindingSubspanOp op) {
auto shapedType =
op.getResult().getType().dyn_cast<IREE::Flow::DispatchTensorType>();
if (!shapedType || !shapedType.hasRank()) return;
OpBuilder::InsertionGuard g(b);
b.setInsertionPoint(op);
// Just change the result type of the InterfaceBindingSubspanOp to form
// the base buffer.
auto tensorType =
op.result().getType().cast<IREE::Flow::DispatchTensorType>();
auto memRefType = getMemrefTypeForTensor(tensorType);
auto baseBuffer = b.create<IREE::HAL::InterfaceBindingSubspanOp>(
op->getLoc(), memRefType, op.binding(), op.byte_offset(),
op.byte_length());
bvm.map(op, baseBuffer);
});
// Visit all the operations that return `tensor`s and convert them to using
// `memref`s.
auto convertTensorProducingOps = [&](Operation *op) -> WalkResult {
return TypeSwitch<Operation *, LogicalResult>(op)
.Case<ConstantOp>([&](ConstantOp constantOp) {
return convertConstantOp(b, constantOp, bvm);
})
.Case<IREE::Flow::DispatchTensorStoreOp>(
[&](IREE::Flow::DispatchTensorStoreOp storeOp) {
return convertInterfaceStoreTensorOp(b, storeOp, bvm, plan);
})
.Case<IREE::Flow::DispatchTieShapeOp>(
[&](IREE::Flow::DispatchTieShapeOp shapeOp) {
return convertDispatchTieShapeOp(b, shapeOp, bvm);
})
.Case<scf::ForOp>([&](scf::ForOp forOp) {
if (failed(getOrAllocateResultBuffers(b, forOp, bvm, plan,
allocationFn))) {
return failure();
}
return convertScfForOp(b, forOp, bvm, plan);
})
.Case<scf::IfOp>([&](scf::IfOp ifOp) {
if (failed(getOrAllocateResultBuffers(b, ifOp, bvm, plan,
allocationFn))) {
return failure();
}
return convertScfIfOp(b, ifOp, bvm, plan);
})
.Case<IREE::Flow::DispatchTensorLoadOp, linalg::TensorCollapseShapeOp,
linalg::TensorExpandShapeOp, tensor::ExtractSliceOp,
tensor::CastOp>([&](auto aliasingOp) {
auto aliasingBuffers =
getAliasingBuffersForResults(b, aliasingOp, bvm);
if (failed(getOrAllocateResultBuffers(b, aliasingOp, aliasingBuffers,
bvm, plan, allocationFn))) {
return failure();
}
copyFromAliasingBufferToResultBuffer(
b, aliasingOp->getLoc(), aliasingOp->getOperand(0),
aliasingOp->getResult(0), aliasingBuffers, bvm, plan);
return success();
})
.Case<linalg::PadTensorOp>([&](linalg::PadTensorOp padTensorOp) {
if (failed(getOrAllocateResultBuffers(b, padTensorOp, bvm, plan,
allocationFn))) {
return failure();
}
return convertPadTensorOp(b, padTensorOp, bvm);
})
.Case<linalg::LinalgOp, linalg_ext::LinalgExtOp>([&](auto op) {
if (failed(
getOrAllocateResultBuffers(b, op, bvm, plan, allocationFn))) {
return failure();
}
return convertAnyLinalgOp(b, op, bvm, plan, allocationFn);
})
.Case<tensor::InsertSliceOp>(
[&](tensor::InsertSliceOp subTensorInsertOp) {
if (failed(getOrAllocateResultBuffers(b, subTensorInsertOp, bvm,
plan, allocationFn))) {
return failure();
}
return convertSubTensorInsertOp(b, subTensorInsertOp, bvm, plan);
})
.Case<tensor::InsertOp>([&](tensor::InsertOp tensorInsertOp) {
if (failed(getOrAllocateResultBuffers(b, tensorInsertOp, bvm, plan,
allocationFn))) {
return failure();
}
return convertTensorInsertOp(b, tensorInsertOp, bvm, plan);
})
.Case<vector::TransferWriteOp>(
[&](vector::TransferWriteOp transferWriteOp) {
if (failed(getOrAllocateResultBuffers(b, transferWriteOp, bvm,
plan, allocationFn))) {
return failure();
}
return convertVectorTransferWriteOp(b, transferWriteOp, bvm,
plan);
})
.Default([&](Operation *op) { return success(); });
};
auto walkResult =
funcOp.walk<WalkOrder::PreOrder>([&](Operation *op) -> WalkResult {
b.setInsertionPoint(op);
return convertTensorProducingOps(op);
});
if (walkResult.wasInterrupted()) {
return signalPassFailure();
}
// Lastly visit the operations that do not produce `tensor` values but still
// use them. These need to be updated to use the corresponding `memref` values,
// but don't need to update the block-and-value mapping.
auto convertNonTensorProducingOps = [&](Operation *op) -> LogicalResult {
return TypeSwitch<Operation *, LogicalResult>(op)
.Case<tensor::ExtractOp>([&](tensor::ExtractOp op) {
return convertTensorExtractOp(b, op, bvm);
})
.Case<vector::TransferReadOp>([&](auto op) {
for (unsigned i : llvm::seq<unsigned>(0, op->getNumOperands())) {
Value operand = op->getOperand(i);
if (operand.getType().isa<RankedTensorType>()) {
Value remappedVal = bvm.lookupOrNull(operand);
if (remappedVal) op->setOperand(i, remappedVal);
}
}
return success();
})
.Case<tensor::DimOp>([&](tensor::DimOp dimOp) {
Value operand = dimOp.source();
Value remappedVal = bvm.lookupOrNull(operand);
Value newDimOp = b.create<memref::DimOp>(dimOp.getLoc(), remappedVal,
dimOp.index());
dimOp.replaceAllUsesWith(newDimOp);
return success();
})
.Case<scf::ForOp>([&](scf::ForOp forOp) {
// To canonicalize the `scf.for` tensor result/operand/yield value
// away, forward the init argument to the yield of the loop.
auto yieldOp = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
for (auto arg : llvm::enumerate(forOp.getIterOperands())) {
if (!arg.value().getType().isa<RankedTensorType>()) continue;
yieldOp.setOperand(arg.index(), arg.value());
}
return success();
})
.Case<IREE::Flow::DispatchTensorStoreOp>([&](auto op) {
op.erase();
return success();
})
.Default([&](Operation *op) { return success(); });
};
walkResult = funcOp.walk([&](Operation *op) -> WalkResult {
b.setInsertionPoint(op);
return convertNonTensorProducingOps(op);
});
if (walkResult.wasInterrupted()) {
return signalPassFailure();
}
}
static Value defaultAllocationFn(OpBuilder &builder, Location loc,
ArrayRef<int64_t> staticShape,
Type elementType,
ArrayRef<Value> dynamicSizes) {
auto allocationType = MemRefType::get(staticShape, elementType);
return builder.create<memref::AllocOp>(loc, allocationType, dynamicSizes);
}
std::unique_ptr<OperationPass<FuncOp>> createLinalgBufferizePass(
WorkgroupMemoryAllocationFn allocationFn) {
return std::make_unique<LinalgBufferizePass>(
allocationFn ? allocationFn : defaultAllocationFn);
}
void addLinalgBufferizePasses(OpPassManager &passManager,
WorkgroupMemoryAllocationFn allocationFn) {
passManager.addNestedPass<FuncOp>(createLinalgBufferizePass(allocationFn));
passManager.addPass(memref::createResolveShapedTypeResultDimsPass());
passManager.addNestedPass<FuncOp>(createCanonicalizerPass());
passManager.addNestedPass<FuncOp>(createCSEPass());
passManager.addNestedPass<FuncOp>(createCleanupBufferAllocViewPass());
// passManager.addPass(createBufferHoistingPass());
// TODO(nicolasvasilache): bug in buffer loop hoisting with
// dynamic_linalg_matmul_on_tensors_fuse_0.mlir
// passManager.addPass(createBufferLoopHoistingPass());
}
} // namespace iree_compiler
} // namespace mlir