Convert func->util as part of input conversion. (#16411) We now use `util.func` in place of `func.func` in all host code in the compiler. flow/stream/hal executables continue to use `func.func` as before for compatibility with upstream code and the benefits of the util ops are fewer. Most code is still written against the function/callable/call op interfaces so that we support initializers and other future function types we may add. All tests have been updated to use `util.func` for consistency even if the pass does still work with `func.func`. There's a few TODOs around better supporting tied function operands in IPO and other passes but we aren't currently ever producing functions with tied operands so they are hacked to bail in cases where they are (IPO doesn't act on functions/calls with tied operands, etc).

commit: 045bca1e213ea9ab4210277776430b497b64be10 [log] [tgz]
author: Ben Vanik <ben.vanik@gmail.com> Thu Feb 15 15:53:12 2024 -0800
committer: GitHub <noreply@github.com> Thu Feb 15 23:53:12 2024 +0000
tree: daa0baaa2d024f85cbb9fb1f9aed0246388d968f
parent: 1ee6007a2c07b1b251bc4393e6778b13a278d33c [diff]
diff --git a/compiler/plugins/input/StableHLO/stablehlo-iree/Conversion/test/auto_input_conversion.mlir b/compiler/plugins/input/StableHLO/stablehlo-iree/Conversion/test/auto_input_conversion.mlir
index 6776bcb..08b556b 100644
--- a/compiler/plugins/input/StableHLO/stablehlo-iree/Conversion/test/auto_input_conversion.mlir
+++ b/compiler/plugins/input/StableHLO/stablehlo-iree/Conversion/test/auto_input_conversion.mlir

@@ -2,7 +2,7 @@
 
 // Check that the auto input conversion pipeline uses this plugin.
 
-// CHECK-LABEL: func.func @simple_add_stablehlo
+// CHECK-LABEL: util.func public @simple_add_stablehlo
 // CHECK:  arith.addi
 func.func @simple_add_stablehlo(%arg0: tensor<2x2xi32>, %arg1: tensor<2x2xi32>) -> tensor<2x2xi32> {
   %0 = stablehlo.add %arg0, %arg1 : (tensor<2x2xi32>, tensor<2x2xi32>) -> tensor<2x2xi32>

diff --git a/compiler/plugins/input/TOSA/tosa-iree/InputConversion/test/auto_input_conversion.mlir b/compiler/plugins/input/TOSA/tosa-iree/InputConversion/test/auto_input_conversion.mlir
index 957f0d3..145f2d9 100644
--- a/compiler/plugins/input/TOSA/tosa-iree/InputConversion/test/auto_input_conversion.mlir
+++ b/compiler/plugins/input/TOSA/tosa-iree/InputConversion/test/auto_input_conversion.mlir

@@ -2,7 +2,7 @@
 
 // Check that the auto input conversion pipeline uses this plugin.
 
-// CHECK-LABEL: func.func @simple_add_tosa
+// CHECK-LABEL: util.func public @simple_add_tosa
 // CHECK:  arith.addi
 func.func @simple_add_tosa(%arg0: tensor<2x2xi32>, %arg1: tensor<2x2xi32>) -> tensor<2x2xi32> {
   %0 = tosa.add %arg0, %arg1 : (tensor<2x2xi32>, tensor<2x2xi32>) -> tensor<2x2xi32>

diff --git a/compiler/plugins/input/Torch/torch-iree/InputConversion/test/auto_input_conversion.mlir b/compiler/plugins/input/Torch/torch-iree/InputConversion/test/auto_input_conversion.mlir
index fc3f57e..9256519 100644
--- a/compiler/plugins/input/Torch/torch-iree/InputConversion/test/auto_input_conversion.mlir
+++ b/compiler/plugins/input/Torch/torch-iree/InputConversion/test/auto_input_conversion.mlir

@@ -2,7 +2,7 @@
 
 // Check that the auto input conversion pipeline uses this plugin.
 
-// CHECK-LABEL: func.func @simple_add_torch
+// CHECK-LABEL: util.func public @simple_add_torch
 // CHECK:  arith.addf
 func.func @simple_add_torch(%arg0: !torch.vtensor<[2],f32>, %arg1: !torch.vtensor<[2],f32>) -> !torch.vtensor<[2],f32> {
   %int1 = torch.constant.int 1
@@ -12,7 +12,7 @@
 
 // -----
 
-// CHECK-LABEL: func.func @simple_add_onnx
+// CHECK-LABEL: util.func public @simple_add_onnx
 // CHECK:  arith.addi
 func.func @simple_add_onnx(%arg0: !torch.vtensor<[],si64>, %arg1: !torch.vtensor<[],si64>) -> !torch.vtensor<[],si64> attributes {torch.onnx_meta.ir_version = 8 : si64, torch.onnx_meta.opset_version = 17 : si64, torch.onnx_meta.producer_name = "pytorch", torch.onnx_meta.producer_version = "2.1.0"} {
   %0 = torch.operator "onnx.Add"(%arg0, %arg1) : (!torch.vtensor<[],si64>, !torch.vtensor<[],si64>) -> !torch.vtensor<[],si64>

diff --git a/compiler/src/iree/compiler/Bindings/Native/Transforms/BUILD.bazel b/compiler/src/iree/compiler/Bindings/Native/Transforms/BUILD.bazel
index 33594a7..599c664 100644
--- a/compiler/src/iree/compiler/Bindings/Native/Transforms/BUILD.bazel
+++ b/compiler/src/iree/compiler/Bindings/Native/Transforms/BUILD.bazel

@@ -29,7 +29,6 @@
         "//compiler/src/iree/compiler/Utils",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:AffineUtils",
-        "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:FunctionInterfaces",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",

diff --git a/compiler/src/iree/compiler/Bindings/Native/Transforms/CMakeLists.txt b/compiler/src/iree/compiler/Bindings/Native/Transforms/CMakeLists.txt
index 66617bf..571299e 100644
--- a/compiler/src/iree/compiler/Bindings/Native/Transforms/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Bindings/Native/Transforms/CMakeLists.txt

@@ -22,7 +22,6 @@
   DEPS
     LLVMSupport
     MLIRAffineUtils
-    MLIRFuncDialect
     MLIRFunctionInterfaces
     MLIRIR
     MLIRPass

diff --git a/compiler/src/iree/compiler/Bindings/Native/Transforms/ConvertStreamableOps.cpp b/compiler/src/iree/compiler/Bindings/Native/Transforms/ConvertStreamableOps.cpp
index c8de124..385b836 100644
--- a/compiler/src/iree/compiler/Bindings/Native/Transforms/ConvertStreamableOps.cpp
+++ b/compiler/src/iree/compiler/Bindings/Native/Transforms/ConvertStreamableOps.cpp

@@ -7,7 +7,8 @@
 #include "iree/compiler/Bindings/Native/Transforms/Passes.h"
 #include "iree/compiler/Dialect/Flow/IR/FlowDialect.h"
 #include "iree/compiler/Dialect/Flow/IR/FlowOps.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "iree/compiler/Dialect/Util/IR/UtilDialect.h"
+#include "iree/compiler/Dialect/Util/IR/UtilOps.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/MLIRContext.h"
@@ -80,7 +81,7 @@
 // Converts a func.func with the iree.abi.streamable attribute into a flow.func
 // and fixes all func.call ops to be flow.call across the module.
 static std::optional<StreamableFunc>
-convertStreamableFunc(mlir::ModuleOp moduleOp, func::FuncOp funcOp,
+convertStreamableFunc(mlir::ModuleOp moduleOp, IREE::Util::FuncOp funcOp,
                       SymbolTable &symbolTable) {
   OpBuilder moduleBuilder(funcOp);
   auto functionType = funcOp.getFunctionType();
@@ -137,8 +138,18 @@
     }
   }
 
+  bool anyTiedOperands = false;
   streamableFunc.tiedOperands.resize(functionType.getNumResults(),
                                      IREE::Util::TiedOpInterface::kUntiedIndex);
+  if (auto tiedOperandsAttr = funcOp.getTiedOperandsAttr()) {
+    for (auto [resultIndex, tiedAttr] : llvm::enumerate(
+             funcOp.getTiedOperandsAttr().getAsRange<IntegerAttr>())) {
+      if (tiedAttr.getInt() != IREE::Util::TiedOpInterface::kUntiedIndex) {
+        streamableFunc.tiedOperands[resultIndex] = tiedAttr.getInt();
+        anyTiedOperands = true;
+      }
+    }
+  }
   SmallVector<DictionaryAttr> funcResAttrs;
   for (auto [i, resultType] : llvm::enumerate(functionType.getResults())) {
     // Tensor results need to have their dynamic dimensions specified.
@@ -157,8 +168,8 @@
     if (auto oldResAttrs = funcOp.getResultAttrDict(i)) {
       // First check if the result is tied to an argument.
       // We can use this to source the initial set of dynamic dimensions.
-      if (auto tiedAttr = oldResAttrs.getAs<IntegerAttr>("iree.abi.tied")) {
-        streamableFunc.tiedOperands[i] = tiedAttr.getInt();
+      int64_t tiedIndex = streamableFunc.tiedOperands[i];
+      if (tiedIndex != IREE::Util::TiedOpInterface::kUntiedIndex) {
         if (!streamableFunc.resultDimsFunc &&
             shapedType == functionType.getInput(i)) {
           // Tied types match and we can infer the shape from that. This may
@@ -195,8 +206,7 @@
 
       // Pass-through all other attrs we don't care about.
       for (auto resAttr : oldResAttrs) {
-        if (resAttr.getName() == "iree.abi.tied" ||
-            resAttr.getName() == "iree.abi.dims") {
+        if (resAttr.getName() == "iree.abi.dims") {
           continue;
         }
         newResAttrs.push_back(resAttr);
@@ -221,10 +231,13 @@
   }
 
   // Create the new streamable flow.func op at the same place as the original.
+  auto tiedOperandsAttr =
+      anyTiedOperands
+          ? moduleBuilder.getIndexArrayAttr(streamableFunc.tiedOperands)
+          : ArrayAttr{};
   streamableFunc.funcOp = moduleBuilder.create<IREE::Flow::FuncOp>(
-      funcOp.getLoc(), funcOp.getName(), functionType,
-      moduleBuilder.getIndexArrayAttr(streamableFunc.tiedOperands), funcAttrs,
-      funcArgAttrs, funcResAttrs);
+      funcOp.getLoc(), funcOp.getName(), functionType, tiedOperandsAttr,
+      funcAttrs, funcArgAttrs, funcResAttrs);
 
   // Swap out the symbol in the symbol table.
   symbolTable.erase(funcOp);
@@ -234,7 +247,7 @@
 }
 
 static LogicalResult convertStreamableCall(StreamableFunc &streamableFunc,
-                                           func::CallOp callOp) {
+                                           IREE::Util::CallOp callOp) {
   OpBuilder builder(callOp);
 
   // Capture all argument dynamic dimensions.
@@ -253,9 +266,10 @@
     // It should return the required number of dynamic dimensions.
     SmallVector<Type> resultDimTypes(streamableFunc.requiredResultDims,
                                      builder.getIndexType());
-    auto calculateCallOp = builder.create<func::CallOp>(
-        callOp.getLoc(), streamableFunc.resultDimsFunc, resultDimTypes,
-        callOp.getOperands());
+    auto calculateCallOp = builder.create<IREE::Util::CallOp>(
+        callOp.getLoc(), resultDimTypes,
+        streamableFunc.resultDimsFunc.getLeafReference().getValue(),
+        callOp.getOperands(), ArrayAttr{});
     llvm::append_range(resultDims, calculateCallOp.getResults());
   } else {
     // Get the shape dimensions from existing call arguments or tied operands.
@@ -301,7 +315,7 @@
 static LogicalResult
 convertStreamableCalls(mlir::ModuleOp moduleOp,
                        DenseMap<StringRef, StreamableFunc> &streamableFuncs) {
-  auto walkResult = moduleOp.walk([&](func::CallOp callOp) {
+  auto walkResult = moduleOp.walk([&](IREE::Util::CallOp callOp) {
     auto it = streamableFuncs.find(callOp.getCallee());
     if (it != streamableFuncs.end()) {
       if (failed(convertStreamableCall(it->second, callOp))) {
@@ -320,8 +334,8 @@
   ConvertStreamableOpsPass(const ConvertStreamableOpsPass &pass) {}
 
   void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<func::FuncDialect, mlir::tensor::TensorDialect,
-                    IREE::Flow::FlowDialect>();
+    registry.insert<mlir::tensor::TensorDialect, IREE::Flow::FlowDialect,
+                    IREE::Util::UtilDialect>();
   }
 
   StringRef getArgument() const override {
@@ -337,8 +351,8 @@
     auto moduleOp = getOperation();
 
     // Gather functions that need wrapping.
-    SmallVector<func::FuncOp> originalFuncOps;
-    for (auto funcOp : moduleOp.getOps<func::FuncOp>()) {
+    SmallVector<IREE::Util::FuncOp> originalFuncOps;
+    for (auto funcOp : moduleOp.getOps<IREE::Util::FuncOp>()) {
       // Ignore functions already marked as having their ABI goo handled.
       if (funcOp->hasAttr("iree.abi.streamable")) {
         if (!funcOp.isExternal()) {

diff --git a/compiler/src/iree/compiler/Bindings/Native/Transforms/Passes.cpp b/compiler/src/iree/compiler/Bindings/Native/Transforms/Passes.cpp
index cac0abc..9c22431 100644
--- a/compiler/src/iree/compiler/Bindings/Native/Transforms/Passes.cpp
+++ b/compiler/src/iree/compiler/Bindings/Native/Transforms/Passes.cpp

@@ -10,7 +10,6 @@
 
 #include "iree/compiler/Dialect/Util/IR/UtilOps.h"
 #include "iree/compiler/Utils/PassUtils.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Pass/PassOptions.h"
 #include "mlir/Pass/PassRegistry.h"
 #include "mlir/Transforms/Passes.h"
@@ -18,7 +17,7 @@
 namespace mlir::iree_compiler::IREE::ABI {
 
 using FunctionLikeNest =
-    MultiOpNest<func::FuncOp, IREE::Util::InitializerOp, IREE::Util::FuncOp>;
+    MultiOpNest<IREE::Util::InitializerOp, IREE::Util::FuncOp>;
 
 void buildTransformPassPipeline(OpPassManager &passManager,
                                 const InvocationOptions &invocationOptions) {

diff --git a/compiler/src/iree/compiler/Bindings/Native/Transforms/WrapEntryPoints.cpp b/compiler/src/iree/compiler/Bindings/Native/Transforms/WrapEntryPoints.cpp
index 6216eab..ed8af82 100644
--- a/compiler/src/iree/compiler/Bindings/Native/Transforms/WrapEntryPoints.cpp
+++ b/compiler/src/iree/compiler/Bindings/Native/Transforms/WrapEntryPoints.cpp

@@ -7,9 +7,10 @@
 #include "iree/compiler/Bindings/Native/Transforms/Passes.h"
 #include "iree/compiler/Dialect/HAL/IR/HALDialect.h"
 #include "iree/compiler/Dialect/HAL/IR/HALOps.h"
+#include "iree/compiler/Dialect/Util/IR/UtilDialect.h"
+#include "iree/compiler/Dialect/Util/IR/UtilOps.h"
 #include "llvm/ADT/STLExtras.h"
 #include "mlir/Dialect/Affine/Utils.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/MLIRContext.h"
@@ -59,13 +60,14 @@
 }
 
 // Creates the corresponding wrapper function for the given import function.
-static func::FuncOp
+static IREE::Util::FuncOp
 createImportWrapperFunc(IREE::ABI::InvocationModel invocationModel,
-                        func::FuncOp importOp, FunctionType oldImportType,
-                        FunctionType newImportType, StringRef privateName) {
+                        FunctionOpInterface importOp,
+                        FunctionType oldImportType, FunctionType newImportType,
+                        StringRef privateName) {
   // Create the internal wrapper function with the original import signature.
   auto wrapperOp =
-      func::FuncOp::create(importOp.getLoc(), privateName, oldImportType);
+      IREE::Util::FuncOp::create(importOp.getLoc(), privateName, oldImportType);
   wrapperOp.setPrivate();
 
   // Copy arg/result attrs from the import op to the wrapper function.
@@ -202,8 +204,8 @@
     arguments.push_back(signalFence);
 
   // Make the call with the updated types.
-  auto callOp =
-      entryBuilder.create<func::CallOp>(importOp.getLoc(), importOp, arguments);
+  auto callOp = entryBuilder.create<IREE::Util::CallOp>(importOp.getLoc(),
+                                                        importOp, arguments);
 
   // If the call has side-effects then we need to wait on its signal fence on
   // the host. This is because they may have launched a thread of their own to
@@ -235,7 +237,7 @@
     }
   }
 
-  entryBuilder.create<func::ReturnOp>(importOp.getLoc(), results);
+  entryBuilder.create<IREE::Util::ReturnOp>(importOp.getLoc(), results);
   return wrapperOp;
 }
 
@@ -244,7 +246,7 @@
 // new wrapper function.
 static LogicalResult wrapImportFunc(IREE::ABI::InvocationModel invocationModel,
                                     mlir::ModuleOp moduleOp,
-                                    func::FuncOp importOp,
+                                    FunctionOpInterface importOp,
                                     SymbolTable &symbolTable) {
   // Replace all existing calls to the import to instead call the wrapper.
   auto publicName = importOp.getName().str();
@@ -258,9 +260,8 @@
   }
 
   // Convert import signature types to those required by the binding ABI.
-  auto oldImportType = importOp.getFunctionType();
   SmallVector<Type> inputTypes;
-  for (auto oldType : oldImportType.getInputs()) {
+  for (auto oldType : importOp.getArgumentTypes()) {
     inputTypes.push_back(mapToABIType(oldType));
   }
   auto fenceType = IREE::HAL::FenceType::get(importOp.getContext());
@@ -274,7 +275,7 @@
     break;
   }
   SmallVector<Type> resultTypes;
-  for (auto oldType : oldImportType.getResults()) {
+  for (auto oldType : importOp.getResultTypes()) {
     resultTypes.push_back(mapToABIType(oldType));
   }
   auto newImportType =
@@ -283,7 +284,8 @@
   // Create the wrapper function that matches the original internal types but
   // calls out to the updated import using ABI types.
   auto wrapperOp = createImportWrapperFunc(
-      invocationModel, importOp, oldImportType, newImportType, privateName);
+      invocationModel, importOp, cast<FunctionType>(importOp.getFunctionType()),
+      newImportType, privateName);
   if (!wrapperOp)
     return failure();
   moduleOp.insert(++Block::iterator(importOp), wrapperOp);
@@ -344,7 +346,7 @@
 // meaningful with them (like names).
 static StringAttr
 formatSourceDeclaration(IREE::ABI::InvocationModel invocationModel,
-                        func::FuncOp exportOp, StringRef publicName,
+                        FunctionOpInterface exportOp, StringRef publicName,
                         ArrayAttr allArgAttrs, ArrayAttr allResultAttrs) {
   std::string decl;
   llvm::raw_string_ostream os(decl);
@@ -395,8 +397,8 @@
 // These are attached to the exported function and can be queried at runtime
 // with iree_vm_function_lookup_attr_by_name.
 static void populateReflectionAttrs(IREE::ABI::InvocationModel invocationModel,
-                                    func::FuncOp exportOp,
-                                    func::FuncOp wrapperOp) {
+                                    FunctionOpInterface exportOp,
+                                    IREE::Util::FuncOp wrapperOp) {
   auto *context = exportOp.getContext();
   SmallVector<NamedAttribute> attrs;
 
@@ -441,9 +443,9 @@
 }
 
 // Creates the corresponding wrapper function for the given export function.
-static func::FuncOp
+static IREE::Util::FuncOp
 createExportWrapperFunc(IREE::ABI::InvocationModel invocationModel,
-                        func::FuncOp exportOp, StringRef publicName) {
+                        FunctionOpInterface exportOp, StringRef publicName) {
   // Copy arg/result attrs from the export op to the wrapper function.
   // We may want to remove them from the export but would need to filter.
   SmallVector<DictionaryAttr> argAttrDict;
@@ -458,9 +460,8 @@
   // NOTE: this is where we could change our signature to provide additional
   // values from the runtime bindings as may be required - like semaphores for
   // async behavior or cancellation.
-  auto oldExportType = exportOp.getFunctionType();
   SmallVector<Type> inputTypes;
-  for (auto oldType : oldExportType.getInputs()) {
+  for (auto oldType : exportOp.getArgumentTypes()) {
     inputTypes.push_back(mapToABIType(oldType));
   }
   auto fenceType = IREE::HAL::FenceType::get(exportOp.getContext());
@@ -476,7 +477,7 @@
     break;
   }
   SmallVector<Type> resultTypes;
-  for (auto oldType : oldExportType.getResults()) {
+  for (auto oldType : exportOp.getResultTypes()) {
     resultTypes.push_back(mapToABIType(oldType));
   }
   auto newExportType =
@@ -485,7 +486,7 @@
   // Update the import to the new type and mark it as being converted so we
   // don't try to convert it again.
   auto wrapperOp =
-      func::FuncOp::create(exportOp.getLoc(), publicName, newExportType);
+      IREE::Util::FuncOp::create(exportOp.getLoc(), publicName, newExportType);
   wrapperOp.setPublic();
   wrapperOp->setAttr("iree.abi.stub", UnitAttr::get(exportOp.getContext()));
   wrapperOp.setAllArgAttrs(argAttrDict);
@@ -536,6 +537,7 @@
   }
 
   // Marshal arguments.
+  auto oldExportType = cast<FunctionType>(exportOp.getFunctionType());
   SmallVector<Value> arguments;
   for (auto [argIndex, arg] : llvm::enumerate(
            entryBlock->getArguments().slice(0, oldExportType.getNumInputs()))) {
@@ -555,8 +557,8 @@
   }
 
   // Make the call with the original types.
-  auto callOp =
-      entryBuilder.create<func::CallOp>(exportOp.getLoc(), exportOp, arguments);
+  auto callOp = entryBuilder.create<IREE::Util::CallOp>(exportOp.getLoc(),
+                                                        exportOp, arguments);
   auto asyncResults = llvm::to_vector(callOp.getResults());
 
   // Insert a barrier if requested - all tensors will be calculated and the
@@ -602,7 +604,7 @@
     }
   }
 
-  entryBuilder.create<func::ReturnOp>(exportOp.getLoc(), results);
+  entryBuilder.create<IREE::Util::ReturnOp>(exportOp.getLoc(), results);
   return wrapperOp;
 }
 
@@ -612,20 +614,16 @@
 // bindings can also perform their own equivalent wrapping.
 static LogicalResult wrapExportFunc(IREE::ABI::InvocationModel invocationModel,
                                     mlir::ModuleOp moduleOp,
-                                    func::FuncOp exportOp,
+                                    FunctionOpInterface exportOp,
                                     SymbolTable &symbolTable) {
   // Rename the original function so that our wrapper can use the original
   // name in its public definition.
   auto publicName = exportOp.getName().str();
   auto privateName = "_" + publicName;
-  auto privateNameAttr =
-      mlir::StringAttr::get(exportOp.getContext(), privateName);
-  if (failed(symbolTable.replaceAllSymbolUses(exportOp, privateNameAttr,
-                                              moduleOp))) {
+  if (failed(symbolTable.rename(exportOp, privateName))) {
     return exportOp.emitError() << "unknown symbol table op encountered; "
                                    "cannot fix up symbol names";
   }
-  exportOp.setName(privateNameAttr);
   exportOp.setPrivate();
 
   // Create the wrapper function that conforms to the IREE native ABI and
@@ -634,7 +632,7 @@
       createExportWrapperFunc(invocationModel, exportOp, publicName);
   if (!wrapperOp)
     return failure();
-  moduleOp.insert(Block::iterator(exportOp), wrapperOp);
+  symbolTable.insert(wrapperOp, Block::iterator(exportOp));
 
   return success();
 }
@@ -652,8 +650,8 @@
   }
 
   void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<func::FuncDialect, mlir::arith::ArithDialect,
-                    mlir::tensor::TensorDialect, IREE::HAL::HALDialect>();
+    registry.insert<mlir::arith::ArithDialect, mlir::tensor::TensorDialect,
+                    IREE::HAL::HALDialect, IREE::Util::UtilDialect>();
   }
 
   StringRef getArgument() const override {
@@ -670,13 +668,13 @@
     auto moduleOp = getOperation();
 
     // Gather functions that need wrapping.
-    SmallVector<func::FuncOp> importOps;
-    SmallVector<func::FuncOp> exportOps;
-    for (auto funcOp : moduleOp.getOps<func::FuncOp>()) {
+    SmallVector<FunctionOpInterface> importOps;
+    SmallVector<FunctionOpInterface> exportOps;
+    for (auto funcOp : moduleOp.getOps<IREE::Util::FuncOp>()) {
       // Ignore functions already marked as having their ABI goo handled.
-      if (funcOp->hasAttr("iree.abi.stub"))
+      if (funcOp->hasAttr("iree.abi.stub")) {
         continue;
-      if (funcOp.isExternal()) {
+      } else if (funcOp.isExternal()) {
         // Imported function.
         importOps.push_back(funcOp);
       } else if (funcOp.isPublic()) {

diff --git a/compiler/src/iree/compiler/Bindings/Native/Transforms/test/convert_streamable_ops.mlir b/compiler/src/iree/compiler/Bindings/Native/Transforms/test/convert_streamable_ops.mlir
index 0f74c2a..e889c0e 100644
--- a/compiler/src/iree/compiler/Bindings/Native/Transforms/test/convert_streamable_ops.mlir
+++ b/compiler/src/iree/compiler/Bindings/Native/Transforms/test/convert_streamable_ops.mlir

@@ -1,73 +1,9 @@
 // RUN: iree-opt --iree-abi-convert-streamable-ops --cse --split-input-file %s --verify-diagnostics | FileCheck %s
 
-// Tests most of the features of the conversion.
-
-// CHECK: flow.func private @import(%arg0: tensor<?x2xi32> {some.arg_attr}, %arg1: tensor<?x4xf32>, %arg2: i32, %arg3: index) -> (%arg0, tensor<?x4xi8> {some.result_attr})
-func.func private @import(tensor<?x2xi32> {some.arg_attr}, tensor<?x4xf32>, i32, index) ->
-    (tensor<?x2xi32> {iree.abi.tied = 0 : index}, tensor<?x4xi8> {iree.abi.dims = [3 : index], some.result_attr}) attributes {
-  iree.abi.streamable
-}
-
-// CHECK: func.func private @caller
-func.func private @caller(%arg0: tensor<?x2xi32>, %arg1: tensor<?x4xf32>, %arg2: i32, %dim0: index) -> (tensor<?x2xi32>, tensor<?x4xi8>) {
-  // CHECK-DAG: %[[ARG0_DIM0:.+]] = tensor.dim %arg0, %c0
-  // CHECK-DAG: %[[ARG1_DIM0:.+]] = tensor.dim %arg1, %c0
-  // CHECK: %[[RETS:.+]]:2 = flow.call @import(%arg0, %arg1, %arg2, %arg3) : (tensor<?x2xi32>{%[[ARG0_DIM0]]}, tensor<?x4xf32>{%[[ARG1_DIM0]]}, i32, index) -> (%arg0{%[[ARG0_DIM0]]}, tensor<?x4xi8>{%arg3})
-  %0:2 = call @import(%arg0, %arg1, %arg2, %dim0) : (tensor<?x2xi32>, tensor<?x4xf32>, i32, index) -> (tensor<?x2xi32>, tensor<?x4xi8>)
-  // CHECK: return %[[RETS]]#0, %[[RETS]]#1
-  return %0#0, %0#1 : tensor<?x2xi32>, tensor<?x4xi8>
-}
-
-// -----
-
-// Verifies if a user doesn't specify untied result dynamic dims we error out.
-
-// expected-error @+1 {{missing dynamic dimensions on result 0}}
-func.func private @importMissingResultDims(tensor<?x?xi32>, index, index) -> tensor<?x?xf32> attributes {
-  iree.abi.streamable
-}
-
-// -----
-
-// Tests that untied results with dynamic dimensions can resolve them.
-// Users need to specify in such cases.
-
-// CHECK: flow.func private @importWithResultDims(%arg0: tensor<?x?xi32>, %arg1: index, %arg2: index) -> tensor<?x?xf32>
-func.func private @importWithResultDims(tensor<?x?xi32>, index, index) -> (tensor<?x?xf32> {iree.abi.dims = [1 : index, 2 : index]}) attributes {
-  iree.abi.streamable
-}
-
-// CHECK: func.func private @callerWithResultDims
-func.func private @callerWithResultDims(%arg0: tensor<?x?xi32>, %arg1: index, %arg2: index) -> tensor<?x?xf32> {
-  // CHECK-DAG: %[[ARG0_DIM0:.+]] = tensor.dim %arg0, %c0
-  // CHECK-DAG: %[[ARG0_DIM1:.+]] = tensor.dim %arg0, %c1
-  // CHECK: %[[RET:.+]] = flow.call @importWithResultDims(%arg0, %arg1, %arg2) : (tensor<?x?xi32>{%[[ARG0_DIM0]], %[[ARG0_DIM1]]}, index, index) -> tensor<?x?xf32>{%arg1, %arg2}
-  %0 = call @importWithResultDims(%arg0, %arg1, %arg2) : (tensor<?x?xi32>, index, index) -> tensor<?x?xf32>
-  // CHECK: return %[[RET]]
-  return %0 : tensor<?x?xf32>
-}
-
-// -----
-
-// Verifies if the user tries specifying result dims and a calculation function
-// we properly error.
-
-func.func private @calculateOverconstrainedResultDims(%arg0: index) -> index {
-  return %arg0 : index
-}
-
-// expected-error @+1 {{cannot have both an explicit result shape calculation function}}
-func.func private @importOverconstrainedResultDims(index) -> (tensor<2x?xf32> {iree.abi.dims = [0 : index]}) attributes {
-  iree.abi.streamable,
-  iree.abi.result_dims = @calculateOverconstrainedResultDims
-}
-
-// -----
-
 // Tests using a shape computation function for computing result dimensions.
 
-// CHECK: func.func private @calculateResultDims
-func.func private @calculateResultDims(%arg0: tensor<1x?xi32>, %arg1: i32, %arg2: tensor<?xf32>) -> (index, index) {
+// CHECK: util.func private @calculateResultDims
+util.func private @calculateResultDims(%arg0: tensor<1x?xi32>, %arg1: i32, %arg2: tensor<?xf32>) -> (index, index) {
   // Could do math here, call other imported host functions, etc. Note that
   // doing anything but tensor.dim on the tensor arguments will cause massive
   // performance penalties and should always be avoided.
@@ -77,60 +13,22 @@
   %c1 = arith.constant 1 : index
   %arg0_dim1 = tensor.dim %arg0, %c1 : tensor<1x?xi32>
   %arg2_dim0 = tensor.dim %arg2, %c0 : tensor<?xf32>
-  return %arg0_dim1, %arg2_dim0 : index, index
+  util.return %arg0_dim1, %arg2_dim0 : index, index
 }
 
 // CHECK: flow.func private @importCustomResultDims(%arg0: tensor<1x?xi32>, %arg1: i32, %arg2: tensor<?xf32>) -> (tensor<2x?xf32>, tensor<?xi8>)
-func.func private @importCustomResultDims(tensor<1x?xi32>, i32, tensor<?xf32>) -> (tensor<2x?xf32>, tensor<?xi8>) attributes {
+util.func private @importCustomResultDims(%arg0: tensor<1x?xi32>, %arg1: i32, %arg2: tensor<?xf32>) -> (tensor<2x?xf32>, tensor<?xi8>) attributes {
   iree.abi.streamable,
   iree.abi.result_dims = @calculateResultDims
 }
 
-// CHECK: func.func private @callerCustomResultDims
-func.func private @callerCustomResultDims(%arg0: tensor<1x?xi32>, %arg1: i32, %arg2: tensor<?xf32>) -> (tensor<2x?xf32>, tensor<?xi8>) {
+// CHECK: util.func private @callerCustomResultDims
+util.func private @callerCustomResultDims(%arg0: tensor<1x?xi32>, %arg1: i32, %arg2: tensor<?xf32>) -> (tensor<2x?xf32>, tensor<?xi8>) {
   // CHECK-DAG: %[[ARG0_DIM1:.+]] = tensor.dim %arg0, %c1
   // CHECK-DAG: %[[ARG2_DIM0:.+]] = tensor.dim %arg2, %c0
-  // CHECK: %[[RET_DIMS:.+]]:2 = call @calculateResultDims(%arg0, %arg1, %arg2) : (tensor<1x?xi32>, i32, tensor<?xf32>) -> (index, index)
+  // CHECK: %[[RET_DIMS:.+]]:2 = util.call @calculateResultDims(%arg0, %arg1, %arg2) : (tensor<1x?xi32>, i32, tensor<?xf32>) -> (index, index)
   // CHECK: %[[RETS:.+]]:2 = flow.call @importCustomResultDims(%arg0, %arg1, %arg2) : (tensor<1x?xi32>{%[[ARG0_DIM1]]}, i32, tensor<?xf32>{%[[ARG2_DIM0]]}) -> (tensor<2x?xf32>{%[[RET_DIMS]]#0}, tensor<?xi8>{%[[RET_DIMS]]#1})
-  %0:2 = call @importCustomResultDims(%arg0, %arg1, %arg2) : (tensor<1x?xi32>, i32, tensor<?xf32>) -> (tensor<2x?xf32>, tensor<?xi8>)
-  // CHECK: return %[[RETS]]#0, %[[RETS]]#1
-  return %0#0, %0#1 : tensor<2x?xf32>, tensor<?xi8>
-}
-
-// -----
-
-// Tests that results tied to operands get handled correctly and reuse the
-// argument shapes.
-
-// CHECK: flow.func private @importWithTies(%arg0: tensor<?x?xi32>) -> %arg0
-func.func private @importWithTies(tensor<?x?xi32>) -> (tensor<?x?xi32> {iree.abi.tied = 0 : index}) attributes {
-  iree.abi.streamable
-}
-
-// CHECK: func.func private @callerWithTies
-func.func private @callerWithTies(%arg0: tensor<?x?xi32>) -> tensor<?x?xi32> {
-  // CHECK-DAG: %[[ARG0_DIM0:.+]] = tensor.dim %arg0, %c0
-  // CHECK-DAG: %[[ARG0_DIM1:.+]] = tensor.dim %arg0, %c1
-  // CHECK: %[[RET:.+]] = flow.call @importWithTies(%arg0) : (tensor<?x?xi32>{%[[ARG0_DIM0]], %[[ARG0_DIM1]]}) -> %arg0{%[[ARG0_DIM0]], %[[ARG0_DIM1]]}
-  %0 = call @importWithTies(%arg0) : (tensor<?x?xi32>) -> tensor<?x?xi32>
-  // CHECK: return %[[RET]]
-  return %0 : tensor<?x?xi32>
-}
-
-// -----
-
-// Tests that attrs we don't know about are passed through to the new ops.
-
-// CHECK: flow.func private @importPassThroughAttrs(%arg0: tensor<1xi32> {some.arg_attr}) -> (tensor<1xi8> {some.result_attr}) attributes {some.import_attr}
-func.func private @importPassThroughAttrs(tensor<1xi32> {some.arg_attr}) -> (tensor<1xi8> {some.result_attr}) attributes {
-  iree.abi.streamable,
-  some.import_attr
-}
-
-// CHECK: func.func private @callerPassThroughArgs
-func.func private @callerPassThroughArgs(%arg0: tensor<1xi32>) -> tensor<1xi8> {
-  // CHECK: %[[RET:.+]] = flow.call @importPassThroughAttrs(%arg0) {some.call_attr} : (tensor<1xi32>) -> tensor<1xi8>
-  %0 = call @importPassThroughAttrs(%arg0) {some.call_attr} : (tensor<1xi32>) -> tensor<1xi8>
-  // CHECK: return %[[RET]]
-  return %0 : tensor<1xi8>
+  %0:2 = util.call @importCustomResultDims(%arg0, %arg1, %arg2) : (tensor<1x?xi32>, i32, tensor<?xf32>) -> (tensor<2x?xf32>, tensor<?xi8>)
+  // CHECK: util.return %[[RETS]]#0, %[[RETS]]#1
+  util.return %0#0, %0#1 : tensor<2x?xf32>, tensor<?xi8>
 }

diff --git a/compiler/src/iree/compiler/Bindings/Native/Transforms/test/wrap_entry_points.mlir b/compiler/src/iree/compiler/Bindings/Native/Transforms/test/wrap_entry_points.mlir
index 5aa9da3..d1f4751 100644
--- a/compiler/src/iree/compiler/Bindings/Native/Transforms/test/wrap_entry_points.mlir
+++ b/compiler/src/iree/compiler/Bindings/Native/Transforms/test/wrap_entry_points.mlir

@@ -2,7 +2,7 @@
 
 // Tests basic dynamic tensor I/O marshaling.
 
-// CHECK-LABEL: func.func @dynamicEntry(
+// CHECK-LABEL: util.func public @dynamicEntry(
 //  CHECK-SAME:   %[[ARG0:.+]]: !hal.buffer_view, %[[ARG1:.+]]: !hal.buffer_view
 //  CHECK-SAME: -> (
 //  CHECK-SAME:   !hal.buffer_view, !hal.buffer_view
@@ -15,20 +15,20 @@
 //  CHECK-NEXT:   %[[ARG0_TENSOR:.+]] = hal.tensor.import %[[ARG0]] "input0" : !hal.buffer_view -> tensor<?x8x8x3xf32>{%[[ARG0_DIM0]]}
 //  CHECK-NEXT:   %[[ARG1_DIM0:.+]] = hal.buffer_view.dim<%[[ARG1]] : !hal.buffer_view>[0] : index
 //  CHECK-NEXT:   %[[ARG1_TENSOR:.+]] = hal.tensor.import %[[ARG1]] "input1" : !hal.buffer_view -> tensor<?x8x8x3xf32>{%[[ARG1_DIM0]]}
-//  CHECK-NEXT:   %[[RET_TENSORS:.+]]:2 = call @_dynamicEntry(%[[ARG0_TENSOR]], %[[ARG1_TENSOR]])
+//  CHECK-NEXT:   %[[RET_TENSORS:.+]]:2 = util.call @_dynamicEntry(%[[ARG0_TENSOR]], %[[ARG1_TENSOR]])
 //       CHECK:   %[[RET0_DIM0:.+]] = tensor.dim %[[RET_TENSORS]]#0, %c0{{.*}} : tensor<?x8x8x3xf32>
 //  CHECK-NEXT:   %[[RET0_VIEW:.+]] = hal.tensor.export %[[RET_TENSORS]]#0 "output0" : tensor<?x8x8x3xf32>{%[[RET0_DIM0]]} -> !hal.buffer_view
 //       CHECK:   %[[RET1_DIM0:.+]] = tensor.dim %[[RET_TENSORS]]#1, %c0{{.*}} : tensor<?x8x8x3xf32>
 //  CHECK-NEXT:   %[[RET1_VIEW:.+]] = hal.tensor.export %[[RET_TENSORS]]#1 "output1" : tensor<?x8x8x3xf32>{%[[RET1_DIM0]]} -> !hal.buffer_view
-//  CHECK-NEXT:   return %[[RET0_VIEW]], %[[RET1_VIEW]] : !hal.buffer_view, !hal.buffer_view
+//  CHECK-NEXT:   util.return %[[RET0_VIEW]], %[[RET1_VIEW]] : !hal.buffer_view, !hal.buffer_view
 //  CHECK-NEXT: }
 
-// CHECK-LABEL: func.func private @_dynamicEntry(
-func.func @dynamicEntry(%arg0: tensor<?x8x8x3xf32>, %arg1: tensor<?x8x8x3xf32>) ->
+// CHECK-LABEL: util.func private @_dynamicEntry(
+util.func public @dynamicEntry(%arg0: tensor<?x8x8x3xf32>, %arg1: tensor<?x8x8x3xf32>) ->
     (tensor<?x8x8x3xf32>, tensor<?x8x8x3xf32>) {
   %0 = arith.addf %arg0, %arg1 : tensor<?x8x8x3xf32>
   %1 = arith.addf %0, %arg0 : tensor<?x8x8x3xf32>
-  return %0, %1 : tensor<?x8x8x3xf32>, tensor<?x8x8x3xf32>
+  util.return %0, %1 : tensor<?x8x8x3xf32>, tensor<?x8x8x3xf32>
 }
 
 // -----
@@ -36,18 +36,18 @@
 // Tests that an existing iree.reflection dictionary is merged with the new
 // reflection information.
 
-// CHECK-LABEL: func.func @existingReflection
+// CHECK-LABEL: util.func public @existingReflection
 // CHECK-SAME: iree.reflection =
 // CHECK-SAME:     iree.abi.declaration = "sync func @existingReflection
 // CHECK-SAME:     some.attr = 4 : index
-// CHECK: func.func private @_existingReflection
+// CHECK: util.func private @_existingReflection
 // CHECK-NOT: iree.reflection = {some.attr = 4 : index}
-func.func @existingReflection() attributes {
+util.func public @existingReflection() attributes {
   iree.reflection = {
     some.attr = 4 : index
   }
 } {
-  return
+  util.return
 }
 
 // -----
@@ -55,47 +55,47 @@
 // Tests that iree.abi.declaration is added when needed and otherwise the user
 // provided value is passed through.
 
-// CHECK-LABEL: func.func @existingDeclaration
+// CHECK-LABEL: util.func public @existingDeclaration
 // CHECK-SAME: iree.reflection =
 // CHECK-SAME:     iree.abi.declaration = "some.python.thing(types_are_overrated)"
-func.func @existingDeclaration(%arg0: tensor<i32>) attributes {
+util.func public @existingDeclaration(%arg0: tensor<i32>) attributes {
   iree.abi.declaration = "some.python.thing(types_are_overrated)"
 } {
-  return
+  util.return
 }
 
 // -----
 
 // Tests that name overrides propagate into both metadata and assertion IR.
 
-// CHECK-LABEL: func.func @namedEntry
+// CHECK-LABEL: util.func public @namedEntry
 // CHECK-SAME: iree.reflection =
 // CHECK-SAME:     iree.abi.declaration = "sync func @namedEntry(%my_input_0: tensor<3xf32>, %input1: tensor<3xf32>) -> (%my_output_0: tensor<3xf32>, %output1: tensor<3xf32>)"
-func.func @namedEntry(%arg0: tensor<3xf32> {iree.abi.name = "my_input_0"}, %arg1: tensor<3xf32>) ->
+util.func public @namedEntry(%arg0: tensor<3xf32> {iree.abi.name = "my_input_0"}, %arg1: tensor<3xf32>) ->
     (tensor<3xf32> {iree.abi.name = "my_output_0"}, tensor<3xf32>) {
   %0 = arith.addf %arg0, %arg1 : tensor<3xf32>
-  return %0, %0 : tensor<3xf32>, tensor<3xf32>
+  util.return %0, %0 : tensor<3xf32>, tensor<3xf32>
 }
 
 // -----
 
 // Tests that exports with encodings specified are propagated to the HAL ops.
 
-// CHECK-LABEL: func.func @exportEncodings
+// CHECK-LABEL: util.func public @exportEncodings
 //  CHECK-SAME:   iree.abi.declaration = "sync func @exportEncodings(%input0: tensor<?x8x8x3xf32> {iree.abi.encoding = tensor<?x8x8x3xi32>}) -> (%output0: tensor<?x8x8x3xf32> {iree.abi.encoding = tensor<?x8x8x3xi32>})"
 // CHECK: hal.tensor.import {{.+}} : !hal.buffer_view -> tensor<?x8x8x3xi32> as tensor<?x8x8x3xf32>{{.+}}
 // CHECK: hal.tensor.export {{.+}} : tensor<?x8x8x3xi32> as tensor<?x8x8x3xf32>{{.+}} -> !hal.buffer_view
 
-// CHECK-LABEL: func.func private @_exportEncodings
-func.func @exportEncodings(%arg0: tensor<?x8x8x3xf32> {iree.abi.encoding = tensor<?x8x8x3xi32>}) -> (tensor<?x8x8x3xf32> {iree.abi.encoding = tensor<?x8x8x3xi32>}) {
-  return %arg0 : tensor<?x8x8x3xf32>
+// CHECK-LABEL: util.func private @_exportEncodings
+util.func public @exportEncodings(%arg0: tensor<?x8x8x3xf32> {iree.abi.encoding = tensor<?x8x8x3xi32>}) -> (tensor<?x8x8x3xf32> {iree.abi.encoding = tensor<?x8x8x3xi32>}) {
+  util.return %arg0 : tensor<?x8x8x3xf32>
 }
 
 // -----
 
 // Tests specifying explicit storage for specific function results.
 
-// CHECK-LABEL: func.func @outputStorage
+// CHECK-LABEL: util.func public @outputStorage
 //  CHECK-SAME:   (%[[ARG0:[a-z0-9]+]]: !hal.buffer_view, %[[RET1_STORAGE:[a-z0-9]+]]: !hal.buffer)
 //  CHECK-SAME: -> (!hal.buffer_view, !hal.buffer_view) attributes {
 //  CHECK-SAME:   iree.abi.stub
@@ -104,54 +104,54 @@
 //  CHECK-SAME: } {
 //  CHECK-NEXT:   %[[ARG0_DIM0:.+]] = hal.buffer_view.dim<%[[ARG0]] : !hal.buffer_view>[0] : index
 //  CHECK-NEXT:   %[[ARG0_TENSOR:.+]] = hal.tensor.import %[[ARG0]] "input0" : !hal.buffer_view -> tensor<?x8x8x3xf32>{%[[ARG0_DIM0]]}
-//  CHECK-NEXT:   %[[RET_TENSORS:.+]]:2 = call @_outputStorage(%[[ARG0_TENSOR]], %[[RET1_STORAGE]])
+//  CHECK-NEXT:   %[[RET_TENSORS:.+]]:2 = util.call @_outputStorage(%[[ARG0_TENSOR]], %[[RET1_STORAGE]])
 //       CHECK:   %[[RET0_DIM0:.+]] = tensor.dim %[[RET_TENSORS]]#0, %c0{{.*}} : tensor<?x8x8x3xf32>
 //  CHECK-NEXT:   %[[RET0_VIEW:.+]] = hal.tensor.export %[[RET_TENSORS]]#0 "output0" : tensor<?x8x8x3xf32>{%[[RET0_DIM0]]} -> !hal.buffer_view
 //       CHECK:   %[[RET1_DIM0:.+]] = tensor.dim %[[RET_TENSORS]]#1, %c0{{.*}} : tensor<?x8x8x3xf32>
 //  CHECK-NEXT:   %[[RET1_VIEW:.+]] = hal.tensor.export %[[RET_TENSORS]]#1 "output1" into(%[[RET1_STORAGE]] : !hal.buffer) : tensor<?x8x8x3xf32>{%[[RET1_DIM0]]} -> !hal.buffer_view
-//  CHECK-NEXT:   return %[[RET0_VIEW]], %[[RET1_VIEW]] : !hal.buffer_view, !hal.buffer_view
+//  CHECK-NEXT:   util.return %[[RET0_VIEW]], %[[RET1_VIEW]] : !hal.buffer_view, !hal.buffer_view
 //  CHECK-NEXT: }
 
-// CHECK-LABEL: func.func private @_outputStorage(
-func.func @outputStorage(%arg0: tensor<?x8x8x3xf32>, %ret1: !hal.buffer {iree.abi.output = 1 : index}) ->
+// CHECK-LABEL: util.func private @_outputStorage(
+util.func public @outputStorage(%arg0: tensor<?x8x8x3xf32>, %ret1: !hal.buffer {iree.abi.output = 1 : index}) ->
     (tensor<?x8x8x3xf32>, tensor<?x8x8x3xf32>) {
   %0 = arith.addf %arg0, %arg0 : tensor<?x8x8x3xf32>
   %1 = arith.addf %0, %arg0 : tensor<?x8x8x3xf32>
-  return %0, %1 : tensor<?x8x8x3xf32>, tensor<?x8x8x3xf32>
+  util.return %0, %1 : tensor<?x8x8x3xf32>, tensor<?x8x8x3xf32>
 }
 
 // -----
 
 // Tests that functions already wrapped (iree.abi.stub present) are ignored.
 
-// CHECK-LABEL: func.func @wrappedAlready
+// CHECK-LABEL: util.func public @wrappedAlready
 //  CHECK-SAME: (%arg0: !hal.buffer_view) -> !hal.buffer_view
 //  CHECK-SAME: attributes {iree.abi.stub}
-func.func @wrappedAlready(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
-  return %arg0 : !hal.buffer_view
+util.func public @wrappedAlready(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
+  util.return %arg0 : !hal.buffer_view
 }
-// CHECK-NOT: func.func @_wrappedAlready
+// CHECK-NOT: util.func public @_wrappedAlready
 
 // -----
 
 // Tests that a function calling an exported function is redirected to the
 // original unwrapped call.
 
-// CHECK-LABEL: func.func @exportA(%arg0: !hal.buffer_view) -> !hal.buffer_view
+// CHECK-LABEL: util.func public @exportA(%arg0: !hal.buffer_view) -> !hal.buffer_view
 // CHECK:   call @_exportA
-// CHECK: func.func private @_exportA(%arg0: tensor<?x?xi32>) -> tensor<?x?xi32>
-// CHECK:   return %arg0
-func.func @exportA(%arg0: tensor<?x?xi32>) -> tensor<?x?xi32> {
-  return %arg0 : tensor<?x?xi32>
+// CHECK: util.func private @_exportA(%arg0: tensor<?x?xi32>) -> tensor<?x?xi32>
+// CHECK:   util.return %arg0
+util.func public @exportA(%arg0: tensor<?x?xi32>) -> tensor<?x?xi32> {
+  util.return %arg0 : tensor<?x?xi32>
 }
 
-// CHECK: func.func @exportB(%arg0: !hal.buffer_view) -> !hal.buffer_view
-// CHECK:   call @_exportB
-// CHECK: func.func private @_exportB(%arg0: tensor<?x?xi32>) -> tensor<?x?xi32>
-// CHECK:   call @_exportA
-func.func @exportB(%arg0: tensor<?x?xi32>) -> tensor<?x?xi32> {
-  %0 = call @exportA(%arg0) : (tensor<?x?xi32>) -> tensor<?x?xi32>
-  return %0 : tensor<?x?xi32>
+// CHECK: util.func public @exportB(%arg0: !hal.buffer_view) -> !hal.buffer_view
+// CHECK:   util.call @_exportB
+// CHECK: util.func private @_exportB(%arg0: tensor<?x?xi32>) -> tensor<?x?xi32>
+// CHECK:   util.call @_exportA
+util.func public @exportB(%arg0: tensor<?x?xi32>) -> tensor<?x?xi32> {
+  %0 = util.call @exportA(%arg0) : (tensor<?x?xi32>) -> tensor<?x?xi32>
+  util.return %0 : tensor<?x?xi32>
 }
 
 // -----
@@ -159,44 +159,44 @@
 // Tests that imported functions get converted to canonical ABI types and
 // wrapper functions are built to preserve internal behavior.
 
-// CHECK-LABEL: func.func private @import(!hal.buffer_view) -> !hal.buffer_view
-func.func private @import(tensor<?x2xi32>) -> tensor<2x?xi32>
+// CHECK-LABEL: util.func private @import(%arg0: !hal.buffer_view) -> !hal.buffer_view
+util.func private @import(tensor<?x2xi32>) -> tensor<2x?xi32>
 
-// CHECK: func.func private @_import(%[[ARG_TENSOR:.+]]: tensor<?x2xi32>) -> tensor<2x?xi32> {
+// CHECK: util.func private @_import(%[[ARG_TENSOR:.+]]: tensor<?x2xi32>) -> tensor<2x?xi32> {
 // CHECK:   %[[ARG_DIM:.+]] = tensor.dim %[[ARG_TENSOR]], %c0
 // CHECK:   %[[ARG_VIEW:.+]] = hal.tensor.export %[[ARG_TENSOR]] : tensor<?x2xi32>{%[[ARG_DIM]]} -> !hal.buffer_view
-// CHECK:   %[[RET_VIEW:.+]] = call @import(%[[ARG_VIEW]]) : (!hal.buffer_view) -> !hal.buffer_view
+// CHECK:   %[[RET_VIEW:.+]] = util.call @import(%[[ARG_VIEW]]) : (!hal.buffer_view) -> !hal.buffer_view
 // CHECK:   %[[RET_DIM:.+]] = hal.buffer_view.dim<%[[RET_VIEW]] : !hal.buffer_view>[1]
 // CHECK:   %[[RET_TENSOR:.+]] = hal.tensor.import %[[RET_VIEW]] : !hal.buffer_view -> tensor<2x?xi32>{%[[RET_DIM]]}
-// CHECK:   return %[[RET_TENSOR]]
+// CHECK:   util.return %[[RET_TENSOR]]
 // CHECK: }
 
-// CHECK: func.func private @caller(%arg0: tensor
-func.func private @caller(%arg0: tensor<?x2xi32>) -> tensor<2x?xi32> {
-  // CHECK: call @_import(%arg0) : (tensor<?x2xi32>) -> tensor<2x?xi32>
-  %0 = call @import(%arg0) : (tensor<?x2xi32>) -> tensor<2x?xi32>
-  return %0 : tensor<2x?xi32>
+// CHECK: util.func private @caller(%arg0: tensor
+util.func private @caller(%arg0: tensor<?x2xi32>) -> tensor<2x?xi32> {
+  // CHECK: util.call @_import(%arg0) : (tensor<?x2xi32>) -> tensor<2x?xi32>
+  %0 = util.call @import(%arg0) : (tensor<?x2xi32>) -> tensor<2x?xi32>
+  util.return %0 : tensor<2x?xi32>
 }
 
 // -----
 
 // Tests that imports with encodings specified are propagated to the HAL ops.
 
-// CHECK-LABEL: func.func private @importEncodings(!hal.buffer_view) -> !hal.buffer_view
-func.func private @importEncodings(tensor<?x2xi32> {iree.abi.encoding = tensor<?x2xf32>}) -> (tensor<2x?xi32> {iree.abi.encoding = tensor<2x?xf32>})
+// CHECK-LABEL: util.func private @importEncodings(%arg0: !hal.buffer_view) -> !hal.buffer_view
+util.func private @importEncodings(tensor<?x2xi32> {iree.abi.encoding = tensor<?x2xf32>}) -> (tensor<2x?xi32> {iree.abi.encoding = tensor<2x?xf32>})
 
-// CHECK: func.func private @_importEncodings(%[[ARG_TENSOR:.+]]: tensor<?x2xi32>) -> tensor<2x?xi32> {
+// CHECK: util.func private @_importEncodings(%[[ARG_TENSOR:.+]]: tensor<?x2xi32>) -> tensor<2x?xi32> {
 // CHECK:   %[[ARG_DIM:.+]] = tensor.dim %[[ARG_TENSOR]], %c0
 // CHECK:   %[[ARG_VIEW:.+]] = hal.tensor.export %[[ARG_TENSOR]] : tensor<?x2xi32>{%[[ARG_DIM]]} -> !hal.buffer_view
-// CHECK:   %[[RET_VIEW:.+]] = call @importEncodings(%[[ARG_VIEW]]) : (!hal.buffer_view) -> !hal.buffer_view
+// CHECK:   %[[RET_VIEW:.+]] = util.call @importEncodings(%[[ARG_VIEW]]) : (!hal.buffer_view) -> !hal.buffer_view
 // CHECK:   %[[RET_DIM:.+]] = hal.buffer_view.dim<%[[RET_VIEW]] : !hal.buffer_view>[1]
 // CHECK:   %[[RET_TENSOR:.+]] = hal.tensor.import %[[RET_VIEW]] : !hal.buffer_view -> tensor<2x?xi32>{%[[RET_DIM]]}
-// CHECK:   return %[[RET_TENSOR]]
+// CHECK:   util.return %[[RET_TENSOR]]
 // CHECK: }
 
-// CHECK: func.func private @importEncodingsCaller(%arg0: tensor
-func.func private @importEncodingsCaller(%arg0: tensor<?x2xi32>) -> tensor<2x?xi32> {
+// CHECK: util.func private @importEncodingsCaller(%arg0: tensor
+util.func private @importEncodingsCaller(%arg0: tensor<?x2xi32>) -> tensor<2x?xi32> {
   // CHECK: call @_importEncodings(%arg0) : (tensor<?x2xi32>) -> tensor<2x?xi32>
-  %0 = call @importEncodings(%arg0) : (tensor<?x2xi32>) -> tensor<2x?xi32>
-  return %0 : tensor<2x?xi32>
+  %0 = util.call @importEncodings(%arg0) : (tensor<?x2xi32>) -> tensor<2x?xi32>
+  util.return %0 : tensor<2x?xi32>
 }

diff --git a/compiler/src/iree/compiler/Bindings/Native/Transforms/test/wrap_entry_points_coarse_fences.mlir b/compiler/src/iree/compiler/Bindings/Native/Transforms/test/wrap_entry_points_coarse_fences.mlir
index f6ff6d1..5b37ee4 100644
--- a/compiler/src/iree/compiler/Bindings/Native/Transforms/test/wrap_entry_points_coarse_fences.mlir
+++ b/compiler/src/iree/compiler/Bindings/Native/Transforms/test/wrap_entry_points_coarse_fences.mlir

@@ -1,6 +1,6 @@
 // RUN: iree-opt --pass-pipeline='builtin.module(iree-abi-wrap-entry-points{invocation-model=coarse-fences})' --split-input-file %s | FileCheck %s
 
-// CHECK-LABEL: func.func @asyncEntry(
+// CHECK-LABEL: util.func public @asyncEntry(
 //  CHECK-SAME:   %[[ARG0:.+]]: !hal.buffer_view, %[[ARG1:.+]]: !hal.buffer_view, %[[WAIT:.+]]: !hal.fence, %[[SIGNAL:.+]]: !hal.fence
 //  CHECK-SAME: -> (
 //  CHECK-SAME:   !hal.buffer_view, !hal.buffer_view
@@ -11,93 +11,93 @@
 //  CHECK-SAME: } {
 //  CHECK-NEXT:   %[[ARG0_TENSOR:.+]] = hal.tensor.import wait(%[[WAIT]]) => %[[ARG0]] "input0" : !hal.buffer_view -> tensor<4xf32>
 //  CHECK-NEXT:   %[[ARG1_TENSOR:.+]] = hal.tensor.import wait(%[[WAIT]]) => %[[ARG1]] "input1" : !hal.buffer_view -> tensor<4xf32>
-//  CHECK-NEXT:   %[[RESULT_TENSORS:.+]]:2 = call @_asyncEntry(%[[ARG0_TENSOR]], %[[ARG1_TENSOR]])
+//  CHECK-NEXT:   %[[RESULT_TENSORS:.+]]:2 = util.call @_asyncEntry(%[[ARG0_TENSOR]], %[[ARG1_TENSOR]])
 //  CHECK-NEXT:   %[[READY_TENSORS:.+]]:2 = hal.tensor.barrier join(%[[RESULT_TENSORS]]#0, %[[RESULT_TENSORS]]#1 : tensor<4xf32>, tensor<4xf32>) => %[[SIGNAL]] : !hal.fence
 //  CHECK-NEXT:   %[[RET0_VIEW:.+]] = hal.tensor.export %[[READY_TENSORS]]#0 "output0" : tensor<4xf32> -> !hal.buffer_view
 //  CHECK-NEXT:   %[[RET1_VIEW:.+]] = hal.tensor.export %[[READY_TENSORS]]#1 "output1" : tensor<4xf32> -> !hal.buffer_view
-//  CHECK-NEXT:   return %[[RET0_VIEW]], %[[RET1_VIEW]] : !hal.buffer_view, !hal.buffer_view
+//  CHECK-NEXT:   util.return %[[RET0_VIEW]], %[[RET1_VIEW]] : !hal.buffer_view, !hal.buffer_view
 //  CHECK-NEXT: }
 
-// CHECK-LABEL: func.func private @_asyncEntry(
-func.func @asyncEntry(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> (tensor<4xf32>, tensor<4xf32>) {
+// CHECK-LABEL: util.func private @_asyncEntry(
+util.func public @asyncEntry(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> (tensor<4xf32>, tensor<4xf32>) {
   %0 = arith.addf %arg0, %arg1 : tensor<4xf32>
   %1 = arith.addf %0, %arg0 : tensor<4xf32>
-  return %0, %1 : tensor<4xf32>, tensor<4xf32>
+  util.return %0, %1 : tensor<4xf32>, tensor<4xf32>
 }
 
 // -----
 
-// CHECK-LABEL: func.func @bareFunc
+// CHECK-LABEL: util.func public @bareFunc
 //  CHECK-SAME: (%[[WAIT:.+]]: !hal.fence, %[[SIGNAL:.+]]: !hal.fence)
-//       CHECK:   call @_bareFunc()
+//       CHECK:   util.call @_bareFunc()
 //  CHECK-NEXT:   hal.fence.signal<%[[SIGNAL]] : !hal.fence>
-//  CHECK-NEXT:   return
+//  CHECK-NEXT:   util.return
 
-// CHECK-LABEL: func.func private @_bareFunc(
-func.func @bareFunc() {
-  return
+// CHECK-LABEL: util.func private @_bareFunc(
+util.func public @bareFunc() {
+  util.return
 }
 
 // -----
 
-// CHECK-LABEL: func.func @primitiveArgOnly
+// CHECK-LABEL: util.func public @primitiveArgOnly
 //  CHECK-SAME: (%[[ARG0:.+]]: i32, %[[WAIT:.+]]: !hal.fence, %[[SIGNAL:.+]]: !hal.fence)
-//  CHECK-NEXT:   call @_primitiveArgOnly(%[[ARG0]])
+//  CHECK-NEXT:   util.call @_primitiveArgOnly(%[[ARG0]])
 //  CHECK-NEXT:   hal.fence.signal<%[[SIGNAL]] : !hal.fence>
-//  CHECK-NEXT:   return
+//  CHECK-NEXT:   util.return
 
-// CHECK-LABEL: func.func private @_primitiveArgOnly(
-func.func @primitiveArgOnly(%arg0: i32) {
+// CHECK-LABEL: util.func private @_primitiveArgOnly(
+util.func public @primitiveArgOnly(%arg0: i32) {
   %0 = arith.addi %arg0, %arg0 : i32
   util.optimization_barrier %0 : i32
-  return
+  util.return
 }
 
 // -----
 
-// CHECK-LABEL: func.func @tensorArgOnly
+// CHECK-LABEL: util.func public @tensorArgOnly
 //  CHECK-SAME: (%[[ARG0:.+]]: !hal.buffer_view, %[[WAIT:.+]]: !hal.fence, %[[SIGNAL:.+]]: !hal.fence)
 //       CHECK:   %[[ARG0_TENSOR:.+]] = hal.tensor.import wait(%[[WAIT]]) => %[[ARG0]] "input0" : !hal.buffer_view -> tensor<4xf32>
-//  CHECK-NEXT:   call @_tensorArgOnly(%[[ARG0_TENSOR]])
+//  CHECK-NEXT:   util.call @_tensorArgOnly(%[[ARG0_TENSOR]])
 //  CHECK-NEXT:   hal.fence.signal<%[[SIGNAL]] : !hal.fence>
-//  CHECK-NEXT:   return
+//  CHECK-NEXT:   util.return
 
-// CHECK-LABEL: func.func private @_tensorArgOnly(
-func.func @tensorArgOnly(%arg0: tensor<4xf32>) {
+// CHECK-LABEL: util.func private @_tensorArgOnly(
+util.func public @tensorArgOnly(%arg0: tensor<4xf32>) {
   %0 = arith.addf %arg0, %arg0 : tensor<4xf32>
   util.optimization_barrier %0 : tensor<4xf32>
-  return
+  util.return
 }
 
 // -----
 
-// CHECK-LABEL: func.func @primitiveResultOnly
+// CHECK-LABEL: util.func public @primitiveResultOnly
 //  CHECK-SAME: (%[[WAIT:.+]]: !hal.fence, %[[SIGNAL:.+]]: !hal.fence) -> i32
-//  CHECK-NEXT:   %[[RESULT:.+]] = call @_primitiveResultOnly()
+//  CHECK-NEXT:   %[[RESULT:.+]] = util.call @_primitiveResultOnly()
 //  CHECK-NEXT:   hal.fence.signal<%[[SIGNAL]] : !hal.fence>
-//  CHECK-NEXT:   return %[[RESULT]]
+//  CHECK-NEXT:   util.return %[[RESULT]]
 
-// CHECK-LABEL: func.func private @_primitiveResultOnly(
-func.func @primitiveResultOnly() -> i32 {
+// CHECK-LABEL: util.func private @_primitiveResultOnly(
+util.func public @primitiveResultOnly() -> i32 {
   %0 = arith.constant 8 : i32
   %1 = util.optimization_barrier %0 : i32
-  return %1 : i32
+  util.return %1 : i32
 }
 
 // -----
 
-// CHECK-LABEL: func.func @tensorResultOnly
+// CHECK-LABEL: util.func public @tensorResultOnly
 //  CHECK-SAME: (%[[WAIT:.+]]: !hal.fence, %[[SIGNAL:.+]]: !hal.fence) -> !hal.buffer_view
-//  CHECK-NEXT:   %[[RESULT_TENSOR:.+]] = call @_tensorResultOnly()
+//  CHECK-NEXT:   %[[RESULT_TENSOR:.+]] = util.call @_tensorResultOnly()
 //  CHECK-NEXT:   %[[READY_TENSOR:.+]] = hal.tensor.barrier join(%[[RESULT_TENSOR]] : tensor<4xf32>) => %[[SIGNAL]] : !hal.fence
 //  CHECK-NEXT:   %[[RESULT_VIEW:.+]] = hal.tensor.export %[[READY_TENSOR]]
-//  CHECK-NEXT:   return %[[RESULT_VIEW]]
+//  CHECK-NEXT:   util.return %[[RESULT_VIEW]]
 
-// CHECK-LABEL: func.func private @_tensorResultOnly(
-func.func @tensorResultOnly() -> tensor<4xf32> {
+// CHECK-LABEL: util.func private @_tensorResultOnly(
+util.func public @tensorResultOnly() -> tensor<4xf32> {
   %0 = arith.constant dense<[0.0, 1.0, 2.0, 3.0]> : tensor<4xf32>
   %1 = util.optimization_barrier %0 : tensor<4xf32>
-  return %1 : tensor<4xf32>
+  util.return %1 : tensor<4xf32>
 }
 
 // -----
@@ -108,13 +108,13 @@
 // that is part of their ABI. Users can always manually specify the fences too
 // though that's much more verbose.
 
-// CHECK-LABEL: func.func private @import(!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> (!hal.buffer_view, !hal.buffer_view)
-func.func private @import(tensor<?x2xi32>, tensor<?x3xi32>) -> (tensor<2x?xi32>, tensor<3x?xi32>) attributes {
+// CHECK-LABEL: util.func private @import(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.fence, %arg3: !hal.fence) -> (!hal.buffer_view, !hal.buffer_view)
+util.func private @import(tensor<?x2xi32>, tensor<?x3xi32>) -> (tensor<2x?xi32>, tensor<3x?xi32>) attributes {
   iree.abi.model = "coarse-fences",
   nosideeffects
 }
 
-// CHECK: func.func private @_import(%[[ARG0_TENSOR:.+]]: tensor<?x2xi32>, %[[ARG1_TENSOR:.+]]: tensor<?x3xi32>) -> (tensor<2x?xi32>, tensor<3x?xi32>) {
+// CHECK: util.func private @_import(%[[ARG0_TENSOR:.+]]: tensor<?x2xi32>, %[[ARG1_TENSOR:.+]]: tensor<?x3xi32>) -> (tensor<2x?xi32>, tensor<3x?xi32>) {
 
 // Prepare fences and put a barrier on input arguments:
 // CHECK:   %[[DEVICE:.+]] = hal.devices.get %{{.+}}
@@ -129,7 +129,7 @@
 // CHECK:   %[[ARG1_VIEW:.+]] = hal.tensor.export %[[ARG_BARRIER]]#1 : tensor<?x3xi32>{%[[ARG1_DIM]]} -> !hal.buffer_view
 
 // Call the import:
-// CHECK:   %[[RET_VIEWS:.+]]:2 = call @import(%[[ARG0_VIEW]], %[[ARG1_VIEW]], %[[WAIT_FENCE]], %[[SIGNAL_FENCE]]) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> (!hal.buffer_view, !hal.buffer_view)
+// CHECK:   %[[RET_VIEWS:.+]]:2 = util.call @import(%[[ARG0_VIEW]], %[[ARG1_VIEW]], %[[WAIT_FENCE]], %[[SIGNAL_FENCE]]) : (!hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> (!hal.buffer_view, !hal.buffer_view)
 
 // Import output results from buffer views:
 // CHECK:   %[[RET0_DIM:.+]] = hal.buffer_view.dim<%[[RET_VIEWS]]#0 : !hal.buffer_view>[1]
@@ -137,53 +137,53 @@
 // CHECK:   %[[RET1_DIM:.+]] = hal.buffer_view.dim<%[[RET_VIEWS]]#1 : !hal.buffer_view>[1]
 // CHECK:   %[[RET1_TENSOR:.+]] = hal.tensor.import wait(%[[SIGNAL_FENCE]]) => %[[RET_VIEWS]]#1 : !hal.buffer_view -> tensor<3x?xi32>{%[[RET1_DIM]]}
 
-// CHECK:   return %[[RET0_TENSOR]], %[[RET1_TENSOR]] : tensor<2x?xi32>, tensor<3x?xi32>
+// CHECK:   util.return %[[RET0_TENSOR]], %[[RET1_TENSOR]] : tensor<2x?xi32>, tensor<3x?xi32>
 // CHECK: }
 
-// CHECK: func.func private @caller(%[[ARG0_CALLER:.+]]: tensor<?x2xi32>, %[[ARG1_CALLER:.+]]: tensor<?x3xi32>)
-func.func private @caller(%arg0: tensor<?x2xi32>, %arg1: tensor<?x3xi32>) -> (tensor<2x?xi32>, tensor<3x?xi32>) {
-  // CHECK: %[[RESULTS:.+]]:2 = call @_import(%[[ARG0_CALLER]], %[[ARG1_CALLER]]) : (tensor<?x2xi32>, tensor<?x3xi32>) -> (tensor<2x?xi32>, tensor<3x?xi32>)
-  %results:2 = call @import(%arg0, %arg1) : (tensor<?x2xi32>, tensor<?x3xi32>) -> (tensor<2x?xi32>, tensor<3x?xi32>)
-  // CHECK-NEXT: return %[[RESULTS]]#0, %[[RESULTS]]#1
-  return %results#0, %results#1 : tensor<2x?xi32>, tensor<3x?xi32>
+// CHECK: util.func private @caller(%[[ARG0_CALLER:.+]]: tensor<?x2xi32>, %[[ARG1_CALLER:.+]]: tensor<?x3xi32>)
+util.func private @caller(%arg0: tensor<?x2xi32>, %arg1: tensor<?x3xi32>) -> (tensor<2x?xi32>, tensor<3x?xi32>) {
+  // CHECK: %[[RESULTS:.+]]:2 = util.call @_import(%[[ARG0_CALLER]], %[[ARG1_CALLER]]) : (tensor<?x2xi32>, tensor<?x3xi32>) -> (tensor<2x?xi32>, tensor<3x?xi32>)
+  %results:2 = util.call @import(%arg0, %arg1) : (tensor<?x2xi32>, tensor<?x3xi32>) -> (tensor<2x?xi32>, tensor<3x?xi32>)
+  // CHECK-NEXT: util.return %[[RESULTS]]#0, %[[RESULTS]]#1
+  util.return %results#0, %results#1 : tensor<2x?xi32>, tensor<3x?xi32>
 }
 
 // -----
 
 // Tests a side-effect-free import that doesn't take/return reference types.
 
-// CHECK-LABEL: func.func private @importI32(i32, !hal.fence, !hal.fence) -> i32
-func.func private @importI32(i32) -> i32 attributes {
+// CHECK-LABEL: util.func private @importI32(%arg0: i32, %arg1: !hal.fence, %arg2: !hal.fence) -> i32
+util.func private @importI32(i32) -> i32 attributes {
   iree.abi.model = "coarse-fences",
   nosideeffects
 }
 
 // No fences required as the call has no side-effects and no async resources.
-// CHECK: func.func private @_importI32(%[[ARG0:.+]]: i32) -> i32 {
+// CHECK: util.func private @_importI32(%[[ARG0:.+]]: i32) -> i32 {
 // CHECK:   %[[WAIT_FENCE:.+]] = util.null : !hal.fence
 // CHECK:   %[[SIGNAL_FENCE:.+]] = util.null : !hal.fence
-// CHECK:   %[[RET0:.+]] = call @importI32(%[[ARG0]], %[[WAIT_FENCE]], %[[SIGNAL_FENCE]]) : (i32, !hal.fence, !hal.fence) -> i32
-// CHECK:   return %[[RET0]] : i32
+// CHECK:   %[[RET0:.+]] = util.call @importI32(%[[ARG0]], %[[WAIT_FENCE]], %[[SIGNAL_FENCE]]) : (i32, !hal.fence, !hal.fence) -> i32
+// CHECK:   util.return %[[RET0]] : i32
 // CHECK: }
 
-// CHECK: func.func private @callerI32(%[[ARG0_CALLER:.+]]: i32)
-func.func private @callerI32(%arg0: i32) -> i32 {
-  // CHECK: %[[RESULT:.+]] = call @_importI32(%[[ARG0_CALLER]]) : (i32) -> i32
-  %result = call @importI32(%arg0) : (i32) -> i32
-  // CHECK-NEXT: return %[[RESULT]]
-  return %result : i32
+// CHECK: util.func private @callerI32(%[[ARG0_CALLER:.+]]: i32)
+util.func private @callerI32(%arg0: i32) -> i32 {
+  // CHECK: %[[RESULT:.+]] = util.call @_importI32(%[[ARG0_CALLER]]) : (i32) -> i32
+  %result = util.call @importI32(%arg0) : (i32) -> i32
+  // CHECK-NEXT: util.return %[[RESULT]]
+  util.return %result : i32
 }
 
 // -----
 
 // Tests a side-effecting import that requires a host-side wait.
 
-// CHECK-LABEL: func.func private @importI32Effects(!hal.buffer_view, !hal.fence, !hal.fence) -> i32
-func.func private @importI32Effects(tensor<4xf32>) -> i32 attributes {
+// CHECK-LABEL: util.func private @importI32Effects(%arg0: !hal.buffer_view, %arg1: !hal.fence, %arg2: !hal.fence) -> i32
+util.func private @importI32Effects(tensor<4xf32>) -> i32 attributes {
   iree.abi.model = "coarse-fences"
 }
 
-// CHECK: func.func private @_importI32Effects(%[[ARG0_TENSOR:.+]]: tensor<4xf32>) -> i32 {
+// CHECK: util.func private @_importI32Effects(%[[ARG0_TENSOR:.+]]: tensor<4xf32>) -> i32 {
 
 // Wait for the inputs to be ready and create the signal fence to wait on.
 // CHECK:   %[[DEVICE:.+]] = hal.devices.get %{{.+}}
@@ -195,18 +195,18 @@
 // CHECK:   %[[ARG0_VIEW:.+]] = hal.tensor.export %[[ARG0_BARRIER]] : tensor<4xf32> -> !hal.buffer_view
 
 // Make the import call:
-// CHECK:   %[[RET0:.+]] = call @importI32Effects(%[[ARG0_VIEW]], %[[WAIT_FENCE]], %[[SIGNAL_FENCE]]) : (!hal.buffer_view, !hal.fence, !hal.fence) -> i32
+// CHECK:   %[[RET0:.+]] = util.call @importI32Effects(%[[ARG0_VIEW]], %[[WAIT_FENCE]], %[[SIGNAL_FENCE]]) : (!hal.buffer_view, !hal.fence, !hal.fence) -> i32
 
 // Perform host-side wait.
 // CHECK:   hal.fence.await until([%[[SIGNAL_FENCE]]])
 
-// CHECK:   return %[[RET0]] : i32
+// CHECK:   util.return %[[RET0]] : i32
 // CHECK: }
 
-// CHECK: func.func private @callerI32Effects(%[[ARG0_CALLER:.+]]: tensor<4xf32>)
-func.func private @callerI32Effects(%arg0: tensor<4xf32>) -> i32 {
-  // CHECK: %[[RESULT:.+]] = call @_importI32Effects(%[[ARG0_CALLER]]) : (tensor<4xf32>) -> i32
-  %result = call @importI32Effects(%arg0) : (tensor<4xf32>) -> i32
-  // CHECK-NEXT: return %[[RESULT]]
-  return %result : i32
+// CHECK: util.func private @callerI32Effects(%[[ARG0_CALLER:.+]]: tensor<4xf32>)
+util.func private @callerI32Effects(%arg0: tensor<4xf32>) -> i32 {
+  // CHECK: %[[RESULT:.+]] = util.call @_importI32Effects(%[[ARG0_CALLER]]) : (tensor<4xf32>) -> i32
+  %result = util.call @importI32Effects(%arg0) : (tensor<4xf32>) -> i32
+  // CHECK-NEXT: util.return %[[RESULT]]
+  util.return %result : i32
 }

diff --git a/compiler/src/iree/compiler/Bindings/TFLite/Transforms/BUILD.bazel b/compiler/src/iree/compiler/Bindings/TFLite/Transforms/BUILD.bazel
index 754d084..154bf8d 100644
--- a/compiler/src/iree/compiler/Bindings/TFLite/Transforms/BUILD.bazel
+++ b/compiler/src/iree/compiler/Bindings/TFLite/Transforms/BUILD.bazel

@@ -30,7 +30,6 @@
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:AffineUtils",
         "@llvm-project//mlir:ControlFlowDialect",
-        "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:FunctionInterfaces",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",

diff --git a/compiler/src/iree/compiler/Bindings/TFLite/Transforms/CMakeLists.txt b/compiler/src/iree/compiler/Bindings/TFLite/Transforms/CMakeLists.txt
index 66b8fda..70fe8ae 100644
--- a/compiler/src/iree/compiler/Bindings/TFLite/Transforms/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Bindings/TFLite/Transforms/CMakeLists.txt

@@ -22,7 +22,6 @@
     LLVMSupport
     MLIRAffineUtils
     MLIRControlFlowDialect
-    MLIRFuncDialect
     MLIRFunctionInterfaces
     MLIRIR
     MLIRPass

diff --git a/compiler/src/iree/compiler/Bindings/TFLite/Transforms/Passes.cpp b/compiler/src/iree/compiler/Bindings/TFLite/Transforms/Passes.cpp
index 8f7f989..323864e 100644
--- a/compiler/src/iree/compiler/Bindings/TFLite/Transforms/Passes.cpp
+++ b/compiler/src/iree/compiler/Bindings/TFLite/Transforms/Passes.cpp

@@ -10,7 +10,6 @@
 
 #include "iree/compiler/Dialect/Util/IR/UtilOps.h"
 #include "iree/compiler/Utils/PassUtils.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Pass/PassOptions.h"
 #include "mlir/Pass/PassRegistry.h"
 #include "mlir/Transforms/Passes.h"
@@ -18,7 +17,7 @@
 namespace mlir::iree_compiler::IREE::TFLite {
 
 using FunctionLikeNest =
-    MultiOpNest<func::FuncOp, IREE::Util::InitializerOp, IREE::Util::FuncOp>;
+    MultiOpNest<IREE::Util::InitializerOp, IREE::Util::FuncOp>;
 
 void buildTransformPassPipeline(OpPassManager &passManager) {
   // Wraps the entry points in a "_tflite_xx" function and adds shape support.

diff --git a/compiler/src/iree/compiler/Bindings/TFLite/Transforms/WrapEntryPoints.cpp b/compiler/src/iree/compiler/Bindings/TFLite/Transforms/WrapEntryPoints.cpp
index 95f9381..5e0c539 100644
--- a/compiler/src/iree/compiler/Bindings/TFLite/Transforms/WrapEntryPoints.cpp
+++ b/compiler/src/iree/compiler/Bindings/TFLite/Transforms/WrapEntryPoints.cpp

@@ -12,7 +12,6 @@
 #include "llvm/ADT/StringExtras.h"
 #include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/MLIRContext.h"
@@ -46,9 +45,8 @@
     : public PassWrapper<WrapEntryPointsPass, OperationPass<ModuleOp>> {
 public:
   void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<mlir::func::FuncDialect, mlir::arith::ArithDialect,
-                    mlir::tensor::TensorDialect, IREE::HAL::HALDialect,
-                    IREE::Util::UtilDialect>();
+    registry.insert<mlir::arith::ArithDialect, mlir::tensor::TensorDialect,
+                    IREE::HAL::HALDialect, IREE::Util::UtilDialect>();
   }
 
   StringRef getArgument() const override {
@@ -60,13 +58,13 @@
            "bindings";
   }
 
-  static StringAttr getArgId(func::FuncOp funcOp, int i) {
+  static StringAttr getArgId(IREE::Util::FuncOp funcOp, int i) {
     StringAttr id =
         funcOp.getArgAttrOfType<StringAttr>(i, "ml_program.identifier");
     return id ? id : funcOp.getArgAttrOfType<StringAttr>(i, "iree.identifier");
   }
 
-  static StringAttr getResultId(func::FuncOp funcOp, int i) {
+  static StringAttr getResultId(IREE::Util::FuncOp funcOp, int i) {
     StringAttr id =
         funcOp.getResultAttrOfType<StringAttr>(i, "ml_program.identifier");
     return id ? id
@@ -76,8 +74,8 @@
   void runOnOperation() override {
     auto moduleOp = getOperation();
 
-    SmallVector<func::FuncOp> entryFuncOps;
-    for (auto funcOp : moduleOp.getOps<func::FuncOp>()) {
+    SmallVector<IREE::Util::FuncOp> entryFuncOps;
+    for (auto funcOp : moduleOp.getOps<IREE::Util::FuncOp>()) {
       if (funcOp.isPublic() && !funcOp->hasAttr("iree.abi.stub")) {
         entryFuncOps.push_back(funcOp);
       }
@@ -139,7 +137,7 @@
   // Creates dynamic dim globals for each input and output of |funcOp|.
   static std::pair<SmallVector<DynamicDims>, SmallVector<DynamicDims>>
   createDynamicDimGlobals(Location loc, StringRef namePrefix,
-                          mlir::func::FuncOp funcOp, OpBuilder &moduleBuilder) {
+                          IREE::Util::FuncOp funcOp, OpBuilder &moduleBuilder) {
     auto funcType = funcOp.getFunctionType();
 
     // TFLite requires the tensor names at runtime. If they've previously been
@@ -187,14 +185,14 @@
   }
 
   // Derives a shape calculation function from the given entry point |funcOp|.
-  static mlir::func::FuncOp createShapeCalculationFunc(
-      Location loc, StringRef namePrefix, mlir::func::FuncOp funcOp,
+  static IREE::Util::FuncOp createShapeCalculationFunc(
+      Location loc, StringRef namePrefix, IREE::Util::FuncOp funcOp,
       ArrayRef<DynamicDims> inputDynamicDims,
       ArrayRef<DynamicDims> outputDynamicDims,
       IREE::Util::GlobalOp dirtyGlobalOp, OpBuilder &moduleBuilder) {
     // Clone the entire entry function with all its IR.
     auto calcFuncOp =
-        cast<mlir::func::FuncOp>(moduleBuilder.clone(*funcOp.getOperation()));
+        cast<IREE::Util::FuncOp>(moduleBuilder.clone(*funcOp.getOperation()));
     calcFuncOp.setName(
         moduleBuilder.getStringAttr(namePrefix.str() + "_calculate_shapes"));
     calcFuncOp.setPrivate();
@@ -242,7 +240,7 @@
     // Replace each exit from the function with a storage back to the shape
     // variables.
     for (auto returnOp :
-         llvm::to_vector(calcFuncOp.getOps<mlir::func::ReturnOp>())) {
+         llvm::to_vector(calcFuncOp.getOps<IREE::Util::ReturnOp>())) {
       auto exitLoc = returnOp.getLoc();
       OpBuilder exitBuilder(returnOp);
 
@@ -264,11 +262,11 @@
       auto falseValue =
           exitBuilder.createOrFold<arith::ConstantIntOp>(exitLoc, 0, 1);
       dirtyGlobalOp.createStoreOp(exitLoc, falseValue, exitBuilder);
-      exitBuilder.create<mlir::func::ReturnOp>(exitLoc);
+      exitBuilder.create<IREE::Util::ReturnOp>(exitLoc);
       returnOp.erase();
     }
 
-    OpBuilder::atBlockBegin(returnBlock).create<mlir::func::ReturnOp>(loc);
+    OpBuilder::atBlockBegin(returnBlock).create<IREE::Util::ReturnOp>(loc);
 
     return calcFuncOp;
   }
@@ -363,7 +361,7 @@
   void createQueryInputShapeFunc(Location loc, StringRef namePrefix,
                                  ArrayRef<DynamicDims> inputDynamicDims,
                                  OpBuilder &moduleBuilder) {
-    auto queryFuncOp = moduleBuilder.create<mlir::func::FuncOp>(
+    auto queryFuncOp = moduleBuilder.create<IREE::Util::FuncOp>(
         loc, namePrefix.str() + "_query_input_shape",
         moduleBuilder.getFunctionType(/*inputs=*/
                                       TypeRange{
@@ -385,7 +383,7 @@
         entryBuilder);
 
     auto exitBuilder = OpBuilder::atBlockBegin(exitBlock);
-    exitBuilder.create<mlir::func::ReturnOp>(loc);
+    exitBuilder.create<IREE::Util::ReturnOp>(loc);
   }
 
   // Creates a function to resize |inputGlobalOps| and sets the |dirtyGlobalOp|
@@ -396,7 +394,7 @@
                                   ArrayRef<DynamicDims> inputDynamicDims,
                                   IREE::Util::GlobalOp dirtyGlobalOp,
                                   OpBuilder &moduleBuilder) {
-    auto resizeFuncOp = moduleBuilder.create<mlir::func::FuncOp>(
+    auto resizeFuncOp = moduleBuilder.create<IREE::Util::FuncOp>(
         loc, namePrefix.str() + "_resize_input_shape",
         moduleBuilder.getFunctionType(/*inputs=*/
                                       TypeRange{
@@ -421,7 +419,7 @@
     auto exitBuilder = OpBuilder::atBlockBegin(exitBlock);
     auto trueValue = exitBuilder.createOrFold<arith::ConstantIntOp>(loc, 1, 1);
     dirtyGlobalOp.createStoreOp(loc, trueValue, exitBuilder);
-    exitBuilder.create<mlir::func::ReturnOp>(loc);
+    exitBuilder.create<IREE::Util::ReturnOp>(loc);
   }
 
   // Creates a function to query the |outputGlobalOps| at runtime by the
@@ -430,9 +428,9 @@
   // func.func @_query_output_shape(%index : index, %shape : !util.list<index>)
   void createQueryOutputShapeFunc(Location loc, StringRef namePrefix,
                                   ArrayRef<DynamicDims> outputDynamicDims,
-                                  mlir::func::FuncOp calculateShapeFuncOp,
+                                  IREE::Util::FuncOp calculateShapeFuncOp,
                                   OpBuilder &moduleBuilder) {
-    auto queryFuncOp = moduleBuilder.create<func::FuncOp>(
+    auto queryFuncOp = moduleBuilder.create<IREE::Util::FuncOp>(
         loc, namePrefix.str() + "_query_output_shape",
         moduleBuilder.getFunctionType(/*inputs=*/
                                       TypeRange{
@@ -448,7 +446,8 @@
 
     // Always call the recalculation function - it checks for whether it needs
     // to run based on the dirty flag value.
-    entryBuilder.create<mlir::func::CallOp>(loc, calculateShapeFuncOp);
+    entryBuilder.create<IREE::Util::CallOp>(loc, calculateShapeFuncOp,
+                                            ValueRange{});
 
     auto *exitBlock = buildSwitch(
         loc, entryBlock->getArgument(0), outputDynamicDims.size(),
@@ -458,7 +457,7 @@
         entryBuilder);
 
     auto exitBuilder = OpBuilder::atBlockBegin(exitBlock);
-    exitBuilder.create<mlir::func::ReturnOp>(loc);
+    exitBuilder.create<IREE::Util::ReturnOp>(loc);
   }
 
   // Creates the corresponding wrapper function for the given entry point.
@@ -472,7 +471,7 @@
   //
   // NOTE: today we only support a single entry point; with minor tweaks we
   // could fix this up to support multiple if we wanted.
-  void createWrapperFunc(StringRef namePrefix, mlir::func::FuncOp entryFuncOp,
+  void createWrapperFunc(StringRef namePrefix, IREE::Util::FuncOp entryFuncOp,
                          ArrayRef<DynamicDims> inputDynamicDims,
                          ArrayRef<DynamicDims> outputDynamicDims,
                          IREE::Util::GlobalOp dirtyGlobalOp,
@@ -487,7 +486,7 @@
     auto wrapperFuncType =
         moduleBuilder.getFunctionType(inputTypes, outputTypes);
 
-    auto wrapperFuncOp = moduleBuilder.create<mlir::func::FuncOp>(
+    auto wrapperFuncOp = moduleBuilder.create<IREE::Util::FuncOp>(
         entryFuncOp.getLoc(), "_tflite_main", wrapperFuncType);
     wrapperFuncOp.setPublic();
     wrapperFuncOp.getOperation()->setAttr("iree.abi.stub",
@@ -527,7 +526,7 @@
           TypeAttr::get(inputDynamicDims.tensorType), dynamicDims,
           /*wait_fence=*/Value{}, /*name=*/nullptr));
     }
-    auto callOp = entryBuilder.create<mlir::func::CallOp>(
+    auto callOp = entryBuilder.create<IREE::Util::CallOp>(
         entryFuncOp.getLoc(), entryFuncOp, callOperands);
     SmallVector<Value> callResults;
     for (auto [result, outputDynamicDims] :
@@ -554,11 +553,11 @@
         entryBuilder.create<arith::ConstantIntOp>(entryFuncOp.getLoc(), 0, 1),
         entryBuilder);
 
-    entryBuilder.create<mlir::func::ReturnOp>(entryFuncOp.getLoc(),
+    entryBuilder.create<IREE::Util::ReturnOp>(entryFuncOp.getLoc(),
                                               callResults);
   }
 
-  void wrapEntryPoint(mlir::func::FuncOp funcOp) {
+  void wrapEntryPoint(IREE::Util::FuncOp funcOp) {
     auto loc = funcOp.getLoc();
     auto namePrefix = ("_tflite_" + funcOp.getName()).str();
     OpBuilder moduleBuilder(funcOp);
@@ -601,8 +600,8 @@
 
   // Populates attributes on |wrapperFuncOp| to support runtime reflection like
   // IO tensor names and quantization information.
-  void populateReflectionAttrs(mlir::func::FuncOp entryFuncOp,
-                               mlir::func::FuncOp wrapperFuncOp) {
+  void populateReflectionAttrs(IREE::Util::FuncOp entryFuncOp,
+                               IREE::Util::FuncOp wrapperFuncOp) {
     SmallVector<NamedAttribute> attrs;
     attrs.push_back(buildIONamesAttr(entryFuncOp));
     // TODO(#3972): tfl.io.quant: quantization information.
@@ -615,7 +614,7 @@
   //   tfl.io.names=arg0;arg1;ret0;ret1
   //
   // Default names will be used if no identifiers are set on the function.
-  NamedAttribute buildIONamesAttr(mlir::func::FuncOp entryFuncOp) {
+  NamedAttribute buildIONamesAttr(IREE::Util::FuncOp entryFuncOp) {
     SmallVector<std::string> pieces;
     for (int i = 0; i < entryFuncOp.getNumArguments(); ++i) {
       auto identifierAttr = getArgId(entryFuncOp, i);

diff --git a/compiler/src/iree/compiler/Bindings/TFLite/Transforms/test/wrap_entry_points.mlir b/compiler/src/iree/compiler/Bindings/TFLite/Transforms/test/wrap_entry_points.mlir
index fc087c1..5d9216b 100644
--- a/compiler/src/iree/compiler/Bindings/TFLite/Transforms/test/wrap_entry_points.mlir
+++ b/compiler/src/iree/compiler/Bindings/TFLite/Transforms/test/wrap_entry_points.mlir

@@ -14,7 +14,7 @@
 
 
 
-// CHECK-LABEL: func.func private @_tflite_dynamicEntry_calculate_shapes() {
+// CHECK-LABEL: util.func private @_tflite_dynamicEntry_calculate_shapes() {
 
 // Only recalculate shapes if the shapes are dirty.
 //       CHECK:   %[[IS_DIRTY:.+]] = util.global.load @_tflite_dynamicEntry_shapes_dirty : i1
@@ -43,16 +43,16 @@
 
 // Clear dirty bit now that the shapes have been recalculated.
 //       CHECK:   util.global.store %false, @_tflite_dynamicEntry_shapes_dirty : i1
-//  CHECK-NEXT:   return
+//  CHECK-NEXT:   util.return
 
 // Exit for when the shapes are not dirty and no work is needed.
 //  CHECK-NEXT: ^bb2:
-//  CHECK-NEXT:   return
+//  CHECK-NEXT:   util.return
 //  CHECK-NEXT: }
 
 
 
-// CHECK-LABEL: func.func @_tflite_dynamicEntry_query_input_shape
+// CHECK-LABEL: util.func public @_tflite_dynamicEntry_query_input_shape
 //  CHECK-SAME:   (%[[INDEX:.+]]: index, %[[LIST:.+]]: !util.list<index>)
 
 // Query input0 shape:
@@ -82,12 +82,12 @@
 
 // Invalid input index:
 //       CHECK: ^bb4:
-//  CHECK-NEXT:   return
+//  CHECK-NEXT:   util.return
 //  CHECK-NEXT: }
 
 
 
-// CHECK-LABEL: func.func @_tflite_dynamicEntry_resize_input_shape
+// CHECK-LABEL: util.func public @_tflite_dynamicEntry_resize_input_shape
 //  CHECK-SAME:   (%[[INDEX:.+]]: index, %[[LIST:.+]]: !util.list<index>)
 
 //       CHECK:   %[[IS_0:.+]] = arith.cmpi eq, %[[INDEX]], %c0 : index
@@ -108,12 +108,12 @@
 // Set the dirty flag so that shape calculation must run again.
 //  CHECK-NEXT: ^bb4:
 //  CHECK-NEXT:   util.global.store %true, @_tflite_dynamicEntry_shapes_dirty : i1
-//  CHECK-NEXT:   return
+//  CHECK-NEXT:   util.return
 //  CHECK-NEXT: }
 
 
 
-// CHECK-LABEL: func.func @_tflite_dynamicEntry_query_output_shape
+// CHECK-LABEL: util.func public @_tflite_dynamicEntry_query_output_shape
 //  CHECK-SAME:   (%[[INDEX:.+]]: index, %[[LIST:.+]]: !util.list<index>)
 
 // Recalculate shapes, if needed.
@@ -145,12 +145,12 @@
 //  CHECK-NEXT:   cf.br ^bb4
 
 //  CHECK-NEXT: ^bb4:
-//  CHECK-NEXT:   return
+//  CHECK-NEXT:   util.return
 //  CHECK-NEXT: }
 
 
 
-// CHECK-LABEL: func.func @_tflite_main(
+// CHECK-LABEL: util.func public @_tflite_main(
 //  CHECK-SAME:   %[[IN0_BUFFER:.+]]: !hal.buffer {iree.identifier = "input0"},
 //  CHECK-SAME:   %[[IN1_BUFFER:.+]]: !hal.buffer {iree.identifier = "input1"})
 //  CHECK-SAME: -> (
@@ -172,7 +172,7 @@
 // CHECK-NEXT:   %[[IN1:.+]] = hal.tensor.import %[[IN1_BUFFER]] : !hal.buffer -> tensor<?x8x8x3xf32>{%[[IN1_DIM0]]}
 
 // Call the original function with tensor arguments.
-//      CHECK:   %[[OUT:.+]]:2 = call @dynamicEntry(%[[IN0]], %[[IN1]]) : (tensor<?x8x8x3xf32>, tensor<?x8x8x3xf32>) -> (tensor<?x8x8x3xf32>, tensor<?x8x8x3xf32>)
+//      CHECK:   %[[OUT:.+]]:2 = util.call @dynamicEntry(%[[IN0]], %[[IN1]]) : (tensor<?x8x8x3xf32>, tensor<?x8x8x3xf32>) -> (tensor<?x8x8x3xf32>, tensor<?x8x8x3xf32>)
 
 // Query output0 shape and get the HAL buffer to return.
 //      CHECK:   %[[OUT0_DIM0:.+]] = tensor.dim %[[OUT]]#0, %c0 : tensor<?x8x8x3xf32>
@@ -187,13 +187,13 @@
 // Clear shape dirty bit as we've updated the shapes unconditionally.
 // CHECK-NEXT:   util.global.store %false, @_tflite_dynamicEntry_shapes_dirty : i1
 
-// CHECK-NEXT:   return %[[OUT0_BUFFER]], %[[OUT1_BUFFER]]
+// CHECK-NEXT:   util.return %[[OUT0_BUFFER]], %[[OUT1_BUFFER]]
 // CHECK-NEXT: }
 
 
 
-// CHECK-LABEL: func.func private @dynamicEntry(
-func.func @dynamicEntry(
+// CHECK-LABEL: util.func private @dynamicEntry(
+util.func public @dynamicEntry(
   %arg0: tensor<?x8x8x3xf32> {iree.identifier = "input0"},
   %arg1: tensor<?x8x8x3xf32> {iree.identifier = "input1"}
 ) -> (
@@ -204,13 +204,13 @@
   %0 = arith.addf %arg0, %arg1 : tensor<?x8x8x3xf32>
   // CHECK: = arith.addf
   %1 = arith.addf %0, %arg0 : tensor<?x8x8x3xf32>
-  // CHECK: return
-  return %0, %1 : tensor<?x8x8x3xf32>, tensor<?x8x8x3xf32>
+  // CHECK: util.return
+  util.return %0, %1 : tensor<?x8x8x3xf32>, tensor<?x8x8x3xf32>
 }
 
 // -----
 
-// CHECK-LABEL: func.func @_tflite_main(
+// CHECK-LABEL: util.func public @_tflite_main(
 //  CHECK-SAME:   %[[IN0_BUFFER:.+]]: !hal.buffer,
 //  CHECK-SAME:   %[[IN1_BUFFER:.+]]: !hal.buffer)
 //  CHECK-SAME: -> (
@@ -223,7 +223,7 @@
 //  CHECK-SAME:   }
 //  CHECK-SAME: } {
 
-func.func @dynamicEntryWithoutIdentifiers(
+util.func public @dynamicEntryWithoutIdentifiers(
   %arg0: tensor<?x8x8x3xf32>,
   %arg1: tensor<?x8x8x3xf32>
 ) -> (
@@ -234,6 +234,6 @@
   %0 = arith.addf %arg0, %arg1 : tensor<?x8x8x3xf32>
   // CHECK: = arith.addf
   %1 = arith.addf %0, %arg0 : tensor<?x8x8x3xf32>
-  // CHECK: return
-  return %0, %1 : tensor<?x8x8x3xf32>, tensor<?x8x8x3xf32>
+  // CHECK: util.return
+  util.return %0, %1 : tensor<?x8x8x3xf32>, tensor<?x8x8x3xf32>
 }

diff --git a/compiler/src/iree/compiler/ConstEval/test/compile_regressions.mlir b/compiler/src/iree/compiler/ConstEval/test/compile_regressions.mlir
index 72064e4..ff74a8d 100644
--- a/compiler/src/iree/compiler/ConstEval/test/compile_regressions.mlir
+++ b/compiler/src/iree/compiler/ConstEval/test/compile_regressions.mlir

@@ -2,7 +2,7 @@
 
 // Test case reduced by running the pass --iree-util-hoist-into-globals on the
 // following (and then chang the check to a return):
-// func.func @i1_inline_constant() {
+// util.func public @i1_inline_constant() {
 //   %control = arith.constant dense<[true, false, true, false]> : tensor<4xi1>
 //   %a = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32>
 //   %b = arith.constant dense<[5, 6, 7, 8]> : tensor<4xi32>
@@ -18,7 +18,7 @@
 //       linalg.yield %0 : i32
 //     } -> tensor<4xi32>
 //   check.expect_eq_const(%c, dense<[1, 6, 3, 8]> : tensor<4xi32>) : tensor<4xi32>
-//   return
+//   util.return
 // }
 
 // CHECK-LABEL: module @hoisted_tensor_i1_input
@@ -27,9 +27,9 @@
 #map = affine_map<(d0) -> (d0)>
 module @hoisted_tensor_i1_input {
   util.global private @hoisted : tensor<4xi32>
-  func.func @i1_inline_constant() -> tensor<4xi32> {
+  util.func public @i1_inline_constant() -> tensor<4xi32> {
     %hoisted = util.global.load @hoisted : tensor<4xi32>
-    return %hoisted : tensor<4xi32>
+    util.return %hoisted : tensor<4xi32>
   }
   util.initializer attributes {iree.compiler.consteval} {
     %cst = arith.constant dense<[true, false, true, false]> : tensor<4xi1>

diff --git a/compiler/src/iree/compiler/ConstEval/test/failing.mlir b/compiler/src/iree/compiler/ConstEval/test/failing.mlir
index c2fb16c..5fabbeb 100644
--- a/compiler/src/iree/compiler/ConstEval/test/failing.mlir
+++ b/compiler/src/iree/compiler/ConstEval/test/failing.mlir

@@ -6,9 +6,9 @@
 module @eval_i64_scalar {
   util.global private @offset : f64 = -2.0 : f64
   util.global private @hoisted : f64
-  func.func @main() -> f64 {
+  util.func public @main() -> f64 {
     %hoisted = util.global.load @hoisted : f64
-    return %hoisted : f64
+    util.return %hoisted : f64
   }
   util.initializer attributes {iree.compiler.consteval} {
     %cst = arith.constant 44.0 : f64

diff --git a/compiler/src/iree/compiler/ConstEval/test/jit_globals.mlir b/compiler/src/iree/compiler/ConstEval/test/jit_globals.mlir
index f662751..13429e8 100644
--- a/compiler/src/iree/compiler/ConstEval/test/jit_globals.mlir
+++ b/compiler/src/iree/compiler/ConstEval/test/jit_globals.mlir

@@ -4,9 +4,9 @@
 
 module @no_uninitialized {
   util.global private @hoisted : tensor<5x6xf32> = dense<4.0> : tensor<5x6xf32>
-  func.func @main() -> tensor<5x6xf32> {
+  util.func public @main() -> tensor<5x6xf32> {
     %hoisted = util.global.load @hoisted : tensor<5x6xf32>
-    return %hoisted : tensor<5x6xf32>
+    util.return %hoisted : tensor<5x6xf32>
   }
 }
 
@@ -17,9 +17,9 @@
 #map1 = affine_map<(d0, d1) -> (d0, d1)>
 module @linalg_tensor_jit {
   util.global private @hoisted : tensor<5x6xf32>
-  func.func @main() -> tensor<5x6xf32> {
+  util.func public @main() -> tensor<5x6xf32> {
     %hoisted = util.global.load @hoisted : tensor<5x6xf32>
-    return %hoisted : tensor<5x6xf32>
+    util.return %hoisted : tensor<5x6xf32>
   }
   // CHECK-NOT: util.initializer
   util.initializer attributes {iree.compiler.consteval} {
@@ -45,9 +45,9 @@
 // CHECK: util.global private @{{.*}} = dense<2> : tensor<2xi32>
 module @eval_splat_detection {
   util.global private @hoisted : tensor<2xi32>
-  func.func @main() -> tensor<2xi32> {
+  util.func public @main() -> tensor<2xi32> {
     %hoisted = util.global.load @hoisted : tensor<2xi32>
-    return %hoisted : tensor<2xi32>
+    util.return %hoisted : tensor<2xi32>
   }
   util.initializer attributes {iree.compiler.consteval} {
     %cst = arith.constant dense<[2, 2]> : tensor<2xi32>
@@ -61,9 +61,9 @@
 // CHECK-LABEL: @eval_f16_tensor
 module @eval_f16_tensor {
   util.global private @hoisted : tensor<5x6xf16>
-  func.func @main() -> tensor<5x6xf16> {
+  util.func public @main() -> tensor<5x6xf16> {
     %hoisted = util.global.load @hoisted : tensor<5x6xf16>
-    return %hoisted : tensor<5x6xf16>
+    util.return %hoisted : tensor<5x6xf16>
   }
   // expected-warning @+1 {{unsupported type for current jit configuration}}
   util.initializer attributes {iree.compiler.consteval} {
@@ -78,9 +78,9 @@
 // Not currently supported (initializer should remain)
 module @eval_bf16_tensor {
   util.global private @hoisted : tensor<5x6xbf16>
-  func.func @main() -> tensor<5x6xbf16> {
+  util.func public @main() -> tensor<5x6xbf16> {
     %hoisted = util.global.load @hoisted : tensor<5x6xbf16>
-    return %hoisted : tensor<5x6xbf16>
+    util.return %hoisted : tensor<5x6xbf16>
   }
   // expected-warning @+1 {{unsupported type for current jit configuration}}
   util.initializer attributes {iree.compiler.consteval} {
@@ -95,9 +95,9 @@
 // CHECK: util.global private @{{.*}} = dense<[2.000000e+02, 3.200000e+03]> : tensor<2xf32>
 module @eval_f32_tensor {
   util.global private @hoisted : tensor<2xf32>
-  func.func @main() -> tensor<2xf32> {
+  util.func public @main() -> tensor<2xf32> {
     %hoisted = util.global.load @hoisted : tensor<2xf32>
-    return %hoisted : tensor<2xf32>
+    util.return %hoisted : tensor<2xf32>
   }
   util.initializer attributes {iree.compiler.consteval} {
     %cst = arith.constant dense<[2.0e+2, 3.2e+3]> : tensor<2xf32>
@@ -110,9 +110,9 @@
 // CHECK-LABEL: @eval_f64_tensor
 module @eval_f64_tensor {
   util.global private @hoisted : tensor<2xf64>
-  func.func @main() -> tensor<2xf64> {
+  util.func public @main() -> tensor<2xf64> {
     %hoisted = util.global.load @hoisted : tensor<2xf64>
-    return %hoisted : tensor<2xf64>
+    util.return %hoisted : tensor<2xf64>
   }
   // expected-warning @+1 {{unsupported type for current jit configuration}}
   util.initializer attributes {iree.compiler.consteval} {
@@ -127,9 +127,9 @@
 // CHECK: util.global private @{{.*}} = dense<[false, true, false, true, true, false]> : tensor<6xi1>
 module @eval_i1_tensor {
   util.global private @hoisted : tensor<6xi1>
-  func.func @main() -> tensor<6xi1> {
+  util.func public @main() -> tensor<6xi1> {
     %hoisted = util.global.load @hoisted : tensor<6xi1>
-    return %hoisted : tensor<6xi1>
+    util.return %hoisted : tensor<6xi1>
   }
   util.initializer attributes {iree.compiler.consteval} {
     // Note that the level we are testing at is a bit odd in the way i1 vs
@@ -145,9 +145,9 @@
 // CHECK-LABEL: @eval_i4_tensor
 module @eval_i4_tensor {
   util.global private @hoisted : tensor<5x6xi4>
-  func.func @main() -> tensor<5x6xi4> {
+  util.func public @main() -> tensor<5x6xi4> {
     %hoisted = util.global.load @hoisted : tensor<5x6xi4>
-    return %hoisted : tensor<5x6xi4>
+    util.return %hoisted : tensor<5x6xi4>
   }
   // expected-warning @+1 {{unsupported type for current jit configuration}}
   util.initializer attributes {iree.compiler.consteval} {
@@ -162,9 +162,9 @@
 // CHECK: util.global private @{{.*}} = dense<[2, 3]> : tensor<2xi8>
 module @eval_i8_tensor {
   util.global private @hoisted : tensor<2xi8>
-  func.func @main() -> tensor<2xi8> {
+  util.func public @main() -> tensor<2xi8> {
     %hoisted = util.global.load @hoisted : tensor<2xi8>
-    return %hoisted : tensor<2xi8>
+    util.return %hoisted : tensor<2xi8>
   }
   util.initializer attributes {iree.compiler.consteval} {
     %cst = arith.constant dense<[2, 3]> : tensor<2xi8>
@@ -178,9 +178,9 @@
 // CHECK: util.global private @{{.*}} = dense<[2, 3]> : tensor<2xi16>
 module @eval_i16_tensor {
   util.global private @hoisted : tensor<2xi16>
-  func.func @main() -> tensor<2xi16> {
+  util.func public @main() -> tensor<2xi16> {
     %hoisted = util.global.load @hoisted : tensor<2xi16>
-    return %hoisted : tensor<2xi16>
+    util.return %hoisted : tensor<2xi16>
   }
   util.initializer attributes {iree.compiler.consteval} {
     %cst = arith.constant dense<[2, 3]> : tensor<2xi16>
@@ -194,9 +194,9 @@
 // CHECK: util.global private @{{.*}} = dense<[2, 3]> : tensor<2xi32>
 module @eval_i32_tensor {
   util.global private @hoisted : tensor<2xi32>
-  func.func @main() -> tensor<2xi32> {
+  util.func public @main() -> tensor<2xi32> {
     %hoisted = util.global.load @hoisted : tensor<2xi32>
-    return %hoisted : tensor<2xi32>
+    util.return %hoisted : tensor<2xi32>
   }
   util.initializer attributes {iree.compiler.consteval} {
     %cst = arith.constant dense<[2, 3]> : tensor<2xi32>
@@ -210,9 +210,9 @@
 // CHECK: util.global private @{{.*}} = dense<[2, 3]> : tensor<2xi64>
 module @eval_i64_tensor {
   util.global private @hoisted : tensor<2xi64>
-  func.func @main() -> tensor<2xi64> {
+  util.func public @main() -> tensor<2xi64> {
     %hoisted = util.global.load @hoisted : tensor<2xi64>
-    return %hoisted : tensor<2xi64>
+    util.return %hoisted : tensor<2xi64>
   }
   util.initializer attributes {iree.compiler.consteval} {
     %cst = arith.constant dense<[2, 3]> : tensor<2xi64>
@@ -227,9 +227,9 @@
 // CHECK: util.global private @{{.*}} = dense<2> : tensor<2xi64>
 module @eval_i64_tensor_splat {
   util.global private @hoisted : tensor<2xi64>
-  func.func @main() -> tensor<2xi64> {
+  util.func public @main() -> tensor<2xi64> {
     %hoisted = util.global.load @hoisted : tensor<2xi64>
-    return %hoisted : tensor<2xi64>
+    util.return %hoisted : tensor<2xi64>
   }
   util.initializer attributes {iree.compiler.consteval} {
     %cst = arith.constant dense<2> : tensor<2xi64>
@@ -245,9 +245,9 @@
 #map1 = affine_map<(d0, d1) -> (d0, d1)>
 module @serializable_attrs {
   util.global private @hoisted : tensor<5x6xi8>
-  func.func @main() -> tensor<5x6xi8> {
+  util.func public @main() -> tensor<5x6xi8> {
     %hoisted = util.global.load @hoisted : tensor<5x6xi8>
-    return %hoisted : tensor<5x6xi8>
+    util.return %hoisted : tensor<5x6xi8>
   }
   util.global private @constant = #util.byte_pattern<1> : tensor<5x6xi8>
   // CHECK-NOT: util.initializer

diff --git a/compiler/src/iree/compiler/ConstEval/test/scalar_values.mlir b/compiler/src/iree/compiler/ConstEval/test/scalar_values.mlir
index 633c639..734165f 100644
--- a/compiler/src/iree/compiler/ConstEval/test/scalar_values.mlir
+++ b/compiler/src/iree/compiler/ConstEval/test/scalar_values.mlir

@@ -5,9 +5,9 @@
 module @eval_i8_scalar {
   util.global private @offset : i8 = -2 : i8
   util.global private @hoisted : i8
-  func.func @main() -> i8 {
+  util.func public @main() -> i8 {
     %hoisted = util.global.load @hoisted : i8
-    return %hoisted : i8
+    util.return %hoisted : i8
   }
   util.initializer attributes {iree.compiler.consteval} {
     %cst = arith.constant 44 : i8
@@ -24,9 +24,9 @@
 module @eval_i16_scalar {
   util.global private @offset : i16 = -2 : i16
   util.global private @hoisted : i16
-  func.func @main() -> i16 {
+  util.func public @main() -> i16 {
     %hoisted = util.global.load @hoisted : i16
-    return %hoisted : i16
+    util.return %hoisted : i16
   }
   util.initializer attributes {iree.compiler.consteval} {
     %cst = arith.constant 44 : i16
@@ -43,9 +43,9 @@
 module @eval_i32_scalar {
   util.global private @offset : i32 = -2 : i32
   util.global private @hoisted : i32
-  func.func @main() -> i32 {
+  util.func public @main() -> i32 {
     %hoisted = util.global.load @hoisted : i32
-    return %hoisted : i32
+    util.return %hoisted : i32
   }
   util.initializer attributes {iree.compiler.consteval} {
     %cst = arith.constant 44 : i32
@@ -62,9 +62,9 @@
 module @eval_i64_scalar {
   util.global private @offset : i64 = -2 : i64
   util.global private @hoisted : i64
-  func.func @main() -> i64 {
+  util.func public @main() -> i64 {
     %hoisted = util.global.load @hoisted : i64
-    return %hoisted : i64
+    util.return %hoisted : i64
   }
   util.initializer attributes {iree.compiler.consteval} {
     %cst = arith.constant 44 : i64
@@ -81,9 +81,9 @@
 module @eval_f32_scalar {
   util.global private @offset : f32 = -2.0 : f32
   util.global private @hoisted : f32
-  func.func @main() -> f32 {
+  util.func public @main() -> f32 {
     %hoisted = util.global.load @hoisted : f32
-    return %hoisted : f32
+    util.return %hoisted : f32
   }
   util.initializer attributes {iree.compiler.consteval} {
     %cst = arith.constant 44.0 : f32

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Conversion/MeshToFlow/test/channel_creation.mlir b/compiler/src/iree/compiler/Dialect/Flow/Conversion/MeshToFlow/test/channel_creation.mlir
index fe9cbae..f09e8a1 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Conversion/MeshToFlow/test/channel_creation.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Conversion/MeshToFlow/test/channel_creation.mlir

@@ -7,11 +7,11 @@
   // CHECK-NOT: util.global private @_mesh_mesh_1d_axes_0 {inlining_policy = #util.inline.never} : !flow.channel
   mesh.mesh @mesh_1d(shape = 2)
 
-  func.func @f(
+   util.func public @f(
       %arg0 : tensor<1xi8>) -> tensor<1xi8> {
     %0 = mesh.all_reduce %arg0 on @mesh_1d mesh_axes = [0] reduction = <sum>
       : tensor<1xi8> -> tensor<1xi8>
-    return %0 : tensor<1xi8>
+    util.return %0 : tensor<1xi8>
   }
 }
 
@@ -33,9 +33,9 @@
     // CHECK: util.global.store %[[CHANNEL]], @_mesh_mesh_2d_axes_1 : !flow.channel
   mesh.mesh @mesh_2d(shape = 3x4)
 
-  func.func @f(%input : tensor<1xi8>) -> tensor<1xi8> {
+   util.func public @f(%input : tensor<1xi8>) -> tensor<1xi8> {
     %out = mesh.all_reduce %input on @mesh_2d mesh_axes = [1] : tensor<1xi8> -> tensor<1xi8>
-    return %out : tensor<1xi8>
+    util.return %out : tensor<1xi8>
   }
 }
 
@@ -65,9 +65,9 @@
     // CHECK: util.global.store %[[CHANNEL]], @_mesh_mesh_4d_axes_2_1 : !flow.channel
   mesh.mesh @mesh_4d(shape = 3x4x5x6)
 
-  func.func @f(%input : tensor<1xi8>) -> tensor<1xi8> {
+   util.func public @f(%input : tensor<1xi8>) -> tensor<1xi8> {
     %out = mesh.all_reduce %input on @mesh_4d mesh_axes = [2, 1] : tensor<1xi8> -> tensor<1xi8>
-    return %out : tensor<1xi8>
+    util.return %out : tensor<1xi8>
   }
 }
 
@@ -80,10 +80,10 @@
   // CHECK-DAG: util.global private @_mesh_mesh_2d_axes_1 {inlining_policy = #util.inline.never} : !flow.channel
   mesh.mesh @mesh_2d(shape = 3x4)
 
-  func.func @f(%input : tensor<1xi8>) -> (tensor<1xi8>, tensor<1xi8>) {
+   util.func public @f(%input : tensor<1xi8>) -> (tensor<1xi8>, tensor<1xi8>) {
     %out0 = mesh.all_reduce %input on @mesh_2d mesh_axes = [0] : tensor<1xi8> -> tensor<1xi8>
     %out1 = mesh.all_reduce %input on @mesh_2d mesh_axes = [1] : tensor<1xi8> -> tensor<1xi8>
-    return %out0, %out1 : tensor<1xi8>, tensor<1xi8>
+    util.return %out0, %out1 : tensor<1xi8>, tensor<1xi8>
   }
 }
 
@@ -95,10 +95,10 @@
   // CHECK: util.global private @_mesh_mesh_2d_axes_0 {inlining_policy = #util.inline.never} : !flow.channel
   mesh.mesh @mesh_2d(shape = 3x4)
 
-  func.func @f(%input0 : tensor<1xi8>, %input1 : tensor<1xi8>) -> (tensor<1xi8>, tensor<1xi8>) {
+   util.func public @f(%input0 : tensor<1xi8>, %input1 : tensor<1xi8>) -> (tensor<1xi8>, tensor<1xi8>) {
     %out0 = mesh.all_reduce %input0 on @mesh_2d mesh_axes = [0] : tensor<1xi8> -> tensor<1xi8>
     %out1 = mesh.all_reduce %input1 on @mesh_2d mesh_axes = [0] : tensor<1xi8> -> tensor<1xi8>
-    return %out0, %out1 : tensor<1xi8>, tensor<1xi8>
+    util.return %out0, %out1 : tensor<1xi8>, tensor<1xi8>
   }
 }
 
@@ -122,9 +122,9 @@
     // CHECK: util.global.store %[[CHANNEL]], @_mesh_mesh2_axes_1 : !flow.channel
   mesh.mesh @mesh2(shape = 3x4)
 
-  func.func @f(%input0 : tensor<1xi8>, %input1 : tensor<1xi8>) -> (tensor<1xi8>, tensor<1xi8>) {
+   util.func public @f(%input0 : tensor<1xi8>, %input1 : tensor<1xi8>) -> (tensor<1xi8>, tensor<1xi8>) {
     %out0 = mesh.all_reduce %input0 on @mesh1 mesh_axes = [0] : tensor<1xi8> -> tensor<1xi8>
     %out1 = mesh.all_reduce %input1 on @mesh2 mesh_axes = [1] : tensor<1xi8> -> tensor<1xi8>
-    return %out0, %out1 : tensor<1xi8>, tensor<1xi8>
+    util.return %out0, %out1 : tensor<1xi8>, tensor<1xi8>
   }
 }

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Conversion/MeshToFlow/test/collectives.mlir b/compiler/src/iree/compiler/Dialect/Flow/Conversion/MeshToFlow/test/collectives.mlir
index 60d16ab..ef7eb2d 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Conversion/MeshToFlow/test/collectives.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Conversion/MeshToFlow/test/collectives.mlir

@@ -2,8 +2,8 @@
 
 mesh.mesh @mesh_2d(shape = 3x4)
 
-// CHECK-LABEL: func @all_gather_non_default_channel
-func.func @all_gather_non_default_channel(
+// CHECK-LABEL: util.func public @all_gather_non_default_channel
+ util.func public @all_gather_non_default_channel(
     // CHECK-SAME: %[[ARG:.*]]: tensor<3x4xi8>
     %arg0 : tensor<3x4xi8>) -> tensor<3x16xi8> {
   // CHECK-DAG: %[[CHANNEL:.*]] = util.global.load @_mesh_mesh_2d_axes_1 : !flow.channel
@@ -19,16 +19,16 @@
   // CHECK-SAME: ins(%[[ALL_GATHER_RES]] : tensor<16x3xi8>) outs(%[[RES_INIT_VAL]] : tensor<3x16xi8>) permutation = [1, 0]
   %0 = mesh.all_gather %arg0 on @mesh_2d mesh_axes = [1] gather_axis = 1
     : tensor<3x4xi8> -> tensor<3x16xi8>
-  // CHECK: return %[[RES]] : tensor<3x16xi8>
-  return %0 : tensor<3x16xi8>
+  // CHECK: util.return %[[RES]] : tensor<3x16xi8>
+  util.return %0 : tensor<3x16xi8>
 }
 
 // -----
 
 mesh.mesh @mesh_1d(shape = 2)
 
-// CHECK-LABEL: func @all_reduce_sum_default_channel
-func.func @all_reduce_sum_default_channel(
+// CHECK-LABEL: util.func public @all_reduce_sum_default_channel
+ util.func public @all_reduce_sum_default_channel(
     // CHECK-SAME: %[[ARG:.*]]: tensor<1xi8>
     %arg0 : tensor<1xi8>) -> tensor<1xi8> {
   // CHECK: %[[CHANNEL:.*]] = flow.channel.default : !flow.channel
@@ -37,16 +37,16 @@
   // CHECK-SAME: (tensor<1xi8>, tensor<1xi8>, !flow.channel) -> %[[INITIAL_VAL]] as tensor<1xi8>
   %0 = mesh.all_reduce %arg0 on @mesh_1d mesh_axes = [0]
     : tensor<1xi8> -> tensor<1xi8>
-  // CHECK: return %[[RES]] : tensor<1xi8>
-  return %0 : tensor<1xi8>
+  // CHECK: util.return %[[RES]] : tensor<1xi8>
+  util.return %0 : tensor<1xi8>
 }
 
 // -----
 
 mesh.mesh @mesh_2d(shape = 2x2)
 
-// CHECK-LABEL: func @all_reduce_min_non_default_channel
-func.func @all_reduce_min_non_default_channel(
+// CHECK-LABEL: util.func public @all_reduce_min_non_default_channel
+ util.func public @all_reduce_min_non_default_channel(
     // CHECK-SAME: %[[ARG:.*]]: tensor<1xi8>
     %arg0 : tensor<1xi8>) -> tensor<1xi8> {
   // CHECK-DAG: %[[CHANNEL:.*]] = util.global.load @_mesh_mesh_2d_axes_1_0 : !flow.channel
@@ -55,16 +55,16 @@
   // CHECK-SAME: (tensor<1xi8>, tensor<1xi8>, !flow.channel) -> %[[INITIAL_VAL]] as tensor<1xi8>
   %0 = mesh.all_reduce %arg0 on @mesh_2d mesh_axes = [1, 0] reduction = <min>
     : tensor<1xi8> -> tensor<1xi8>
-  // CHECK: return %[[RES]] : tensor<1xi8>
-  return %0 : tensor<1xi8>
+  // CHECK: util.return %[[RES]] : tensor<1xi8>
+  util.return %0 : tensor<1xi8>
 }
 
 // -----
 
 mesh.mesh @mesh_1d(shape = 2)
 
-// CHECK-LABEL: func @all_reduce_f32
-func.func @all_reduce_f32(
+// CHECK-LABEL: util.func public @all_reduce_f32
+ util.func public @all_reduce_f32(
     // CHECK-SAME: %[[ARG:.*]]: tensor<1xf32>
     %arg0 : tensor<1xf32>) -> tensor<1xf32> {
   // CHECK-DAG: %[[CHANNEL:.*]] = flow.channel.default : !flow.channel
@@ -73,29 +73,29 @@
   // CHECK-SAME: (tensor<1xf32>, tensor<1xf32>, !flow.channel) -> %[[INITIAL_VAL]] as tensor<1xf32>
   %0 = mesh.all_reduce %arg0 on @mesh_1d mesh_axes = [0]
     : tensor<1xf32> -> tensor<1xf32>
-  // CHECK: return %[[RES]] : tensor<1xf32>
-  return %0 : tensor<1xf32>
+  // CHECK: util.return %[[RES]] : tensor<1xf32>
+  util.return %0 : tensor<1xf32>
 }
 
 // -----
 
 mesh.mesh @mesh_1d(shape = 2)
 
-// CHECK-LABEL: func @process_linear_index
-func.func @process_linear_index() -> index {
+// CHECK-LABEL: util.func public @process_linear_index
+ util.func public @process_linear_index() -> index {
   // CHECK: %[[CHANNEL:.*]] = flow.channel.default : !flow.channel
   // CHECK: %[[RES:.*]] = flow.channel.rank %[[CHANNEL]] : index
   %0 = mesh.process_linear_index on @mesh_1d : index
-  // CHECK: return %[[RES]] : index
-  return %0 : index
+  // CHECK: util.return %[[RES]] : index
+  util.return %0 : index
 }
 
 // -----
 
 mesh.mesh @mesh_3d(shape = 2x3x4)
 
-// CHECK-LABEL: func @all_to_all_non_default_channel
-func.func @all_to_all_non_default_channel(
+// CHECK-LABEL: util.func public @all_to_all_non_default_channel
+ util.func public @all_to_all_non_default_channel(
     // CHECK-SAME: %[[ARG:.*]]: tensor<1x12x3x4x5xf32>
     %arg0 : tensor<1x12x3x4x5xf32>) -> tensor<1x2x3x24x5xf32> {
   // CHECK: %[[CHANNEL:.*]] = util.global.load @_mesh_mesh_3d_axes_1_0 : !flow.channel
@@ -113,16 +113,16 @@
   // CHECK-SAME-LITERAL: [[0], [1], [2], [3, 4], [5]] : tensor<1x2x3x6x4x5xf32> into tensor<1x2x3x24x5xf32>
   %0 = mesh.all_to_all %arg0 on @mesh_3d mesh_axes = [1, 0] split_axis = 1 concat_axis = 3
     : tensor<1x12x3x4x5xf32> -> tensor<1x2x3x24x5xf32>
-  // CHECK: return %[[COLLAPSED_SPLIT_COUNT_INTO_CONCAT_AXIS]] : tensor<1x2x3x24x5xf32>
-  return %0 : tensor<1x2x3x24x5xf32>
+  // CHECK: util.return %[[COLLAPSED_SPLIT_COUNT_INTO_CONCAT_AXIS]] : tensor<1x2x3x24x5xf32>
+  util.return %0 : tensor<1x2x3x24x5xf32>
 }
 
 // -----
 
 mesh.mesh @mesh_2d(shape = 2x2)
 
-// CHECK-LABEL: func @reduce_scatter_non_default_channel
-func.func @reduce_scatter_non_default_channel(
+// CHECK-LABEL: util.func public @reduce_scatter_non_default_channel
+ util.func public @reduce_scatter_non_default_channel(
     // CHECK-SAME: %[[ARG:.*]]: tensor<3x2xi8>
     %arg0 : tensor<3x2xi8>) -> tensor<3x1xi8> {
   // CHECK-DAG: %[[CHANNEL:.*]] = util.global.load @_mesh_mesh_2d_axes_0 : !flow.channel
@@ -138,6 +138,6 @@
   // CHECK-SAME: ins(%[[REDUCE_SCATTER_RES]] : tensor<1x3xi8>) outs(%[[RES_INIT_VAL]] : tensor<3x1xi8>) permutation = [1, 0]
   %0 = mesh.reduce_scatter %arg0 on @mesh_2d mesh_axes = [0] scatter_axis = 1
     : tensor<3x2xi8> -> tensor<3x1xi8>
-  // CHECK: return %[[RES]] : tensor<3x1xi8>
-  return %0 : tensor<3x1xi8>
+  // CHECK: util.return %[[RES]] : tensor<3x1xi8>
+  util.return %0 : tensor<3x1xi8>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/bitcast.mlir b/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/bitcast.mlir
index c7a4c1b..49d4527 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/bitcast.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/bitcast.mlir

@@ -1,19 +1,19 @@
 // RUN: iree-opt --allow-unregistered-dialect --split-input-file --iree-flow-convert-to-flow %s | FileCheck %s
 
-func.func @static_tensor_bitcast(%arg0: tensor<4x4xf32>) -> tensor<4x4xi32> {
+ util.func public @static_tensor_bitcast(%arg0: tensor<4x4xf32>) -> tensor<4x4xi32> {
   // CHECK-DAG: %[[RESULT:.*]] = flow.tensor.bitcast %arg0 : tensor<4x4xf32> -> tensor<4x4xi32>
-  // CHECK: return %[[RESULT]]
+  // CHECK: util.return %[[RESULT]]
   %0 = tensor.bitcast %arg0 : tensor<4x4xf32> to tensor<4x4xi32>
-  return %0 : tensor<4x4xi32>
+  util.return %0 : tensor<4x4xi32>
 }
 
 // -----
 
-func.func @dynamic_tensor_bitcast(%arg0: tensor<?x?xf32>) -> tensor<?x?xi32> {
+ util.func public @dynamic_tensor_bitcast(%arg0: tensor<?x?xf32>) -> tensor<?x?xi32> {
   // CHECK: %[[DIM0:.+]] = tensor.dim %arg0, %c0 : tensor<?x?xf32>
   // CHECK: %[[DIM1:.+]] = tensor.dim %arg0, %c1 : tensor<?x?xf32>
   // CHECK: %[[RESULT:.+]] = flow.tensor.bitcast %arg0 : tensor<?x?xf32>{%[[DIM0]], %[[DIM1]]} -> tensor<?x?xi32>{%[[DIM0]], %[[DIM1]]}
-  // CHECK: return %[[RESULT]]
+  // CHECK: util.return %[[RESULT]]
   %0 = tensor.bitcast %arg0 : tensor<?x?xf32> to tensor<?x?xi32>
-  return %0 : tensor<?x?xi32>
+  util.return %0 : tensor<?x?xi32>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/cast.mlir b/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/cast.mlir
index 2e9aea5..3fbb254 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/cast.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/cast.mlir

@@ -1,38 +1,38 @@
 // RUN: iree-opt --allow-unregistered-dialect --split-input-file --iree-flow-convert-to-flow %s | FileCheck %s
 
-func.func @static_tensor_cast_to_dynamic(%arg0: tensor<4x4xf32>) -> tensor<?x?xf32> {
+ util.func public @static_tensor_cast_to_dynamic(%arg0: tensor<4x4xf32>) -> tensor<?x?xf32> {
   // CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
   // CHECK-DAG: %[[RESULT:.*]] = flow.tensor.reshape %arg0 : tensor<4x4xf32> -> tensor<?x?xf32>{%[[C4]], %[[C4]]}
-  // CHECK: return %[[RESULT]]
+  // CHECK: util.return %[[RESULT]]
   %0 = tensor.cast %arg0 : tensor<4x4xf32> to tensor<?x?xf32>
-  return %0 : tensor<?x?xf32>
+  util.return %0 : tensor<?x?xf32>
 }
 
 // -----
 
-func.func @dynamic_tensor_cast_to_static(%arg0: tensor<?xf32>) -> tensor<4xf32> {
+ util.func public @dynamic_tensor_cast_to_static(%arg0: tensor<?xf32>) -> tensor<4xf32> {
   // CHECK: %[[C4:.*]] = arith.constant 4 : index
   // CHECK: %[[RESULT:.*]] = flow.tensor.reshape %arg0 : tensor<?xf32>{%[[C4]]} -> tensor<4xf32>
-  // CHECK: return %[[RESULT]]
+  // CHECK: util.return %[[RESULT]]
   %0 = tensor.cast %arg0 : tensor<?xf32> to tensor<4xf32>
-  return %0 : tensor<4xf32>
+  util.return %0 : tensor<4xf32>
 }
 
 // -----
 
-func.func @dynamic_tensor_cast_to_dynamic(%arg0: tensor<?x?xf32>) -> tensor<?x3xf32> {
+ util.func public @dynamic_tensor_cast_to_dynamic(%arg0: tensor<?x?xf32>) -> tensor<?x3xf32> {
   // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
   // CHECK-DAG: %[[D0:.*]] = tensor.dim %arg0, %[[C0]] : tensor<?x?xf32>
   // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
   // CHECK: %[[RESULT:.*]] = flow.tensor.reshape %arg0 : tensor<?x?xf32>{%[[D0]], %[[C3]]} -> tensor<?x3xf32>{%[[D0]]}
-  // CHECK: return %[[RESULT]]
+  // CHECK: util.return %[[RESULT]]
   %0 = tensor.cast %arg0 : tensor<?x?xf32> to tensor<?x3xf32>
-  return %0 : tensor<?x3xf32>
+  util.return %0 : tensor<?x3xf32>
 }
 
 // -----
 
-func.func @tensor_cast_within_dispatch_workgroups_not_converted() -> tensor<f32> {
+ util.func public @tensor_cast_within_dispatch_workgroups_not_converted() -> tensor<f32> {
   %x = arith.constant 100 : index
   %0 = flow.dispatch.workgroups[%x]() : () -> (tensor<f32>) = () {
     // CHECK: = tensor.cast %[[source:.+]] : tensor<4x4xf32> to tensor<?x?xf32>
@@ -41,5 +41,5 @@
     "test.sink"(%2) : (tensor<?x?xf32>) -> ()
     flow.return
   }
-  return %0 : tensor<f32>
+  util.return %0 : tensor<f32>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/extract.mlir b/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/extract.mlir
index ed6ccce..34ff6f5 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/extract.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/extract.mlir

@@ -1,17 +1,17 @@
 // RUN: iree-opt --split-input-file --iree-flow-convert-to-flow %s | FileCheck %s
 
-func.func @tensor_extract(%arg0 : tensor<1xi32>, %arg1 : index) -> i32 {
+ util.func public @tensor_extract(%arg0 : tensor<1xi32>, %arg1 : index) -> i32 {
   // CHECK: %[[RESULT:.*]] = flow.tensor.load %arg0[%arg1] : tensor<1xi32>
-  // CHECK: return %[[RESULT]]
+  // CHECK: util.return %[[RESULT]]
   %extract = tensor.extract %arg0[%arg1] : tensor<1xi32>
-  return %extract : i32
+  util.return %extract : i32
 }
 
 // -----
 
-func.func @tensor_extract_i1(%arg0 : tensor<1xi1>, %arg1 : index) -> i1 {
+ util.func public @tensor_extract_i1(%arg0 : tensor<1xi1>, %arg1 : index) -> i1 {
   // CHECK: %[[RESULT:.*]] = flow.tensor.load %arg0[%arg1] : tensor<1xi1>
-  // CHECK: return %[[RESULT]]
+  // CHECK: util.return %[[RESULT]]
   %extract = tensor.extract %arg0[%arg1] : tensor<1xi1>
-  return %extract : i1
+  util.return %extract : i1
 }

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/extract_slice.mlir b/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/extract_slice.mlir
index 476cc58..8d1ea5e 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/extract_slice.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/extract_slice.mlir

@@ -1,11 +1,11 @@
 // RUN: iree-opt --allow-unregistered-dialect --split-input-file --iree-flow-convert-to-flow %s | FileCheck %s
 
-func.func @extract_slice1(%arg0 : tensor<5x24x48xf32>) -> tensor<4xf32> {
+ util.func public @extract_slice1(%arg0 : tensor<5x24x48xf32>) -> tensor<4xf32> {
   %0 = tensor.extract_slice %arg0[2, 3, 4] [1, 1, 4] [1, 1, 1]
       : tensor<5x24x48xf32> to tensor<4xf32>
-  return %0 : tensor<4xf32>
+  util.return %0 : tensor<4xf32>
 }
-// CHECK-LABEL: func.func @extract_slice1(
+// CHECK-LABEL:  util.func public @extract_slice1(
 //  CHECK-SAME:   %[[ARG0:.+]]: tensor<5x24x48xf32>)
 //   CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
 //   CHECK-DAG:   %[[C3:.+]] = arith.constant 3 : index
@@ -13,16 +13,16 @@
 //   CHECK-DAG:   %[[C4:.+]] = arith.constant 4 : index
 //       CHECK:   %[[SLICE:.+]] = flow.tensor.slice %[[ARG0]][%[[C2]], %[[C3]], %[[C4]] for %[[C1]], %[[C1]], %[[C4]]]
 //       CHECK:   %[[RESULT:.+]] = flow.tensor.reshape %[[SLICE]]
-//       CHECK:   return %[[RESULT]]
+//       CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @extract_slice2(%arg0 : tensor<5x24x48xf32>) -> tensor<2x48xf32> {
+ util.func public @extract_slice2(%arg0 : tensor<5x24x48xf32>) -> tensor<2x48xf32> {
   %0 = tensor.extract_slice %arg0[2, 3, 0] [1, 2, 48] [1, 1, 1]
       : tensor<5x24x48xf32> to tensor<2x48xf32>
-  return %0 : tensor<2x48xf32>
+  util.return %0 : tensor<2x48xf32>
 }
-// CHECK-LABEL: func.func @extract_slice2
+// CHECK-LABEL:  util.func public @extract_slice2
 //  CHECK-SAME:   %[[ARG0:.+]]: tensor<5x24x48xf32>)
 //   CHECK-DAG:   %[[C3:.+]] = arith.constant 3 : index
 //   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
@@ -31,36 +31,36 @@
 //   CHECK-DAG:   %[[C48:.+]] = arith.constant 48 : index
 //       CHECK:   %[[SLICE:.+]] = flow.tensor.slice %[[ARG0]][%[[C2]], %[[C3]], %[[C0]] for %[[C1]], %[[C2]], %[[C48]]]
 //       CHECK:   %[[RESULT:.+]] = flow.tensor.reshape %[[SLICE]]
-//       CHECK:   return %[[RESULT]]
+//       CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @extract_slice3(%arg0 : tensor<5x24x48xf32>) -> tensor<2x24xf32> {
+ util.func public @extract_slice3(%arg0 : tensor<5x24x48xf32>) -> tensor<2x24xf32> {
   %0 = tensor.extract_slice %arg0[2, 3, 0] [1, 2, 24] [1, 1, 1]
       : tensor<5x24x48xf32> to tensor<2x24xf32>
-  return %0 : tensor<2x24xf32>
+  util.return %0 : tensor<2x24xf32>
 }
-// CHECK-LABEL: func.func @extract_slice3
+// CHECK-LABEL:  util.func public @extract_slice3
 //       CHECK:   tensor.extract_slice
 
 // -----
 
-func.func @extract_slice4(%arg0 : tensor<5x24x48xf32>, %arg1 : index) -> tensor<2x24xf32> {
+ util.func public @extract_slice4(%arg0 : tensor<5x24x48xf32>, %arg1 : index) -> tensor<2x24xf32> {
   %0 = tensor.extract_slice %arg0[2, 3, 0] [1, 2, 24] [1, %arg1, 1]
       : tensor<5x24x48xf32> to tensor<2x24xf32>
-  return %0 : tensor<2x24xf32>
+  util.return %0 : tensor<2x24xf32>
 }
-// CHECK-LABEL: func.func @extract_slice4
+// CHECK-LABEL:  util.func public @extract_slice4
 //       CHECK:   tensor.extract_slice
 
 // -----
 
-func.func @extract_slice5(%arg0 : tensor<5x24x48xf32>, %arg1 : index) -> tensor<2x48xf32> {
+ util.func public @extract_slice5(%arg0 : tensor<5x24x48xf32>, %arg1 : index) -> tensor<2x48xf32> {
   %0 = tensor.extract_slice %arg0[2, %arg1, 0] [1, 2, 48] [1, 1, 1]
       : tensor<5x24x48xf32> to tensor<2x48xf32>
-  return %0 : tensor<2x48xf32>
+  util.return %0 : tensor<2x48xf32>
 }
-// CHECK-LABEL: func.func @extract_slice5(
+// CHECK-LABEL:  util.func public @extract_slice5(
 //  CHECK-SAME:   %[[ARG0:.+]]: tensor<5x24x48xf32>
 //  CHECK-SAME:   %[[ARG1:.+]]: index)
 //   CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
@@ -69,16 +69,16 @@
 //   CHECK-DAG:   %[[C48:.+]] = arith.constant 48 : index
 //       CHECK:   %[[SLICE:.+]] = flow.tensor.slice %[[ARG0]][%[[C2]], %[[ARG1]], %[[C0]] for %[[C1]], %[[C2]], %[[C48]]]
 //       CHECK:   %[[RESULT:.+]] = flow.tensor.reshape %[[SLICE]]
-//       CHECK:   return %[[RESULT]]
+//       CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @extract_slice6(%arg0 : tensor<5x24x48xf32>, %arg1 : index) -> tensor<?x48xf32> {
+ util.func public @extract_slice6(%arg0 : tensor<5x24x48xf32>, %arg1 : index) -> tensor<?x48xf32> {
   %0 = tensor.extract_slice %arg0[2, 3, 0] [1, %arg1, 48] [1, 1, 1]
       : tensor<5x24x48xf32> to tensor<?x48xf32>
-  return %0 : tensor<?x48xf32>
+  util.return %0 : tensor<?x48xf32>
 }
-// CHECK-LABEL: func.func @extract_slice6(
+// CHECK-LABEL:  util.func public @extract_slice6(
 //  CHECK-SAME:   %[[ARG0:.+]]: tensor<5x24x48xf32>
 //  CHECK-SAME:   %[[ARG1:.+]]: index)
 //   CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
@@ -88,16 +88,16 @@
 //   CHECK-DAG:   %[[C48:.+]] = arith.constant 48 : index
 //       CHECK:   %[[SLICE:.+]] = flow.tensor.slice %[[ARG0]][%[[C2]], %[[C3]], %[[C0]] for %[[C1]], %[[ARG1]], %[[C48]]]
 //       CHECK:   %[[RESULT:.+]] = flow.tensor.reshape %[[SLICE]]
-//       CHECK:   return %[[RESULT]]
+//       CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @extract_slice7(%arg0 : tensor<5x?x48xf32>, %arg1 : index) -> tensor<2x48xf32> {
+ util.func public @extract_slice7(%arg0 : tensor<5x?x48xf32>, %arg1 : index) -> tensor<2x48xf32> {
   %0 = tensor.extract_slice %arg0[2, 3, 0] [1, 2, 48] [1, 1, 1]
       : tensor<5x?x48xf32> to tensor<2x48xf32>
-  return %0 : tensor<2x48xf32>
+  util.return %0 : tensor<2x48xf32>
 }
-// CHECK-LABEL: func.func @extract_slice7(
+// CHECK-LABEL:  util.func public @extract_slice7(
 //  CHECK-SAME:   %[[ARG0:.+]]: tensor<5x?x48xf32>
 //  CHECK-SAME:   %[[ARG1:.+]]: index)
 //   CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
@@ -108,15 +108,15 @@
 //   CHECK-DAG:   %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor<5x?x48xf32>
 //       CHECK:   %[[SLICE:.+]] = flow.tensor.slice %[[ARG0]][%[[C2]], %[[C3]], %[[C0]] for %[[C1]], %[[C2]], %[[C48]]]
 //       CHECK:   %[[RESULT:.+]] = flow.tensor.reshape %[[SLICE]]
-//       CHECK:   return %[[RESULT]]
+//       CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @rank_reducing_extract_slice(%arg0: tensor<?x513xi32>) -> tensor<513xi32> {
+ util.func public @rank_reducing_extract_slice(%arg0: tensor<?x513xi32>) -> tensor<513xi32> {
   %0 = tensor.extract_slice %arg0[4, 0] [1, 513] [1, 1] : tensor<?x513xi32> to tensor<513xi32>
-  return %0 : tensor<513xi32>
+  util.return %0 : tensor<513xi32>
 }
-// CHECK-LABEL: func.func @rank_reducing_extract_slice
+// CHECK-LABEL:  util.func public @rank_reducing_extract_slice
 //  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]
 //   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 //   CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
@@ -127,16 +127,16 @@
 //  CHECK-SAME:       [%[[C4]], %[[C0]] for %[[C1]], %[[C513]]]
 //  CHECK-SAME:       : tensor<?x513xi32>{%[[DIM]]} -> tensor<1x513xi32>
 //       CHECK:   %[[RESHAPE:.+]] = flow.tensor.reshape %[[SLICE]] : tensor<1x513xi32> -> tensor<513xi32>
-//       CHECK:   return %[[RESHAPE]] : tensor<513xi32>
+//       CHECK:   util.return %[[RESHAPE]] : tensor<513xi32>
 
 // -----
 
-func.func @rank_reducing_extract_slice_trailing_unit_dims
+ util.func public @rank_reducing_extract_slice_trailing_unit_dims
    (%arg0 : tensor<1x50x20x1xf32>) -> tensor<49x20xf32> {
   %0 = tensor.extract_slice %arg0[0, 1, 0, 0] [1, 49, 20, 1] [1, 1, 1, 1] : tensor<1x50x20x1xf32> to tensor<49x20xf32>
-  return %0 : tensor<49x20xf32>
+  util.return %0 : tensor<49x20xf32>
 }
-// CHECK-LABEL: func.func @rank_reducing_extract_slice_trailing_unit_dims
+// CHECK-LABEL:  util.func public @rank_reducing_extract_slice_trailing_unit_dims
 //   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 //   CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
 //   CHECK-DAG:   %[[C49:.+]] = arith.constant 49 : index
@@ -146,7 +146,7 @@
 
 // -----
 
-func.func @extract_slice_within_dispatch_workgroups_not_converted() -> tensor<f32> {
+ util.func public @extract_slice_within_dispatch_workgroups_not_converted() -> tensor<f32> {
   %x = arith.constant 100 : index
   %0 = flow.dispatch.workgroups[%x]() : () -> (tensor<f32>) = () {
     // CHECK: = tensor.extract_slice %[[source:.+]][2, 3, 4] [1, 1, 4] [1, 1, 1] : tensor<5x24x48xf32> to tensor<4xf32>
@@ -156,5 +156,5 @@
     "test.sink"(%2) : (tensor<4xf32>) -> ()
     flow.return
   }
-  return %0 : tensor<f32>
+  util.return %0 : tensor<f32>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/fill.mlir b/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/fill.mlir
index 18b49c8..2b50f17 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/fill.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/fill.mlir

@@ -1,17 +1,17 @@
 // RUN: iree-opt --iree-flow-convert-to-flow --split-input-file %s | FileCheck %s
 
-func.func @tensor_reshape(%arg0 : tensor<?x4x?x5x?x6xf32>, %arg1 : tensor<20x?x40xf32>)
+ util.func public @tensor_reshape(%arg0 : tensor<?x4x?x5x?x6xf32>, %arg1 : tensor<20x?x40xf32>)
     -> (tensor<?x5x?xf32>, tensor<5x4x?x4x2x4x5xf32>)
 {
   %0 = tensor.collapse_shape %arg0 [[0, 1, 2], [3], [4, 5]]
       : tensor<?x4x?x5x?x6xf32> into tensor<?x5x?xf32>
   %1 = tensor.expand_shape %arg1 [[0, 1], [2, 3], [4, 5, 6]]
       : tensor<20x?x40xf32> into tensor<5x4x?x4x2x4x5xf32>
-  return %0, %1 : tensor<?x5x?xf32>, tensor<5x4x?x4x2x4x5xf32>
+  util.return %0, %1 : tensor<?x5x?xf32>, tensor<5x4x?x4x2x4x5xf32>
 }
-// CHECK-LABEL: func.func @tensor_reshape
+// CHECK-LABEL:  util.func public @tensor_reshape
 //  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x4x?x5x?x6xf32>
 //  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: tensor<20x?x40xf32>
 //   CHECK-DAG:   %[[R0:.+]] = flow.tensor.reshape %[[ARG0]]
 //   CHECK-DAG:   %[[R1:.+]] = flow.tensor.reshape %[[ARG1]]
-//       CHECK:   return %[[R0]], %[[R1]]
+//       CHECK:   util.return %[[R0]], %[[R1]]

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/from_elements.mlir b/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/from_elements.mlir
index f74dd18..a32e890 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/from_elements.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/from_elements.mlir

@@ -1,28 +1,28 @@
 // RUN: iree-opt --allow-unregistered-dialect --split-input-file --iree-flow-convert-to-flow %s | FileCheck %s
 
-// CHECK: func.func @tensor.from_elements__to__flow.tensor.splat(%[[arg0:.*]]: i8)
-func.func @tensor.from_elements__to__flow.tensor.splat(%arg0: i8) -> (i8) {
+// CHECK:  util.func public @tensor.from_elements__to__flow.tensor.splat(%[[arg0:.*]]: i8)
+ util.func public @tensor.from_elements__to__flow.tensor.splat(%arg0: i8) -> (i8) {
   // CHECK: %[[splat_res:.*]] = flow.tensor.splat %[[arg0]] : tensor<1xi8>
   %0 = tensor.from_elements %arg0 : tensor<1xi8>
   // CHECK: flow.tensor.load %[[splat_res]]
   %1 = flow.tensor.load %0 : tensor<1xi8>
-  return %1 : i8
+  util.return %1 : i8
 }
 
 // -----
-// CHECK: func.func @tensor.from_elements__not_convertible(%[[arg0:.*]]: i8)
-func.func @tensor.from_elements__not_convertible(%arg0: i8) -> (i8) {
+// CHECK:  util.func public @tensor.from_elements__not_convertible(%[[arg0:.*]]: i8)
+ util.func public @tensor.from_elements__not_convertible(%arg0: i8) -> (i8) {
   // CHECK: %[[c0:.*]] = arith.constant 0
   %c0 = arith.constant 0 : index
   // CHECK: %[[res:.*]] = tensor.from_elements %[[arg0]], %[[arg0]] : tensor<2xi8>
   %0 = tensor.from_elements %arg0, %arg0 : tensor<2xi8>
   // CHECK: flow.tensor.load %[[res]][%[[c0]]]
   %1 = flow.tensor.load %0[%c0] : tensor<2xi8>
-  return %1 : i8
+  util.return %1 : i8
 }
 
 // -----
-func.func @tensor.from_elements__within_dispatch_workgroups_not_converted() -> tensor<f32> {
+ util.func public @tensor.from_elements__within_dispatch_workgroups_not_converted() -> tensor<f32> {
   %x = arith.constant 100 : index
   %0 = flow.dispatch.workgroups[%x]() : () -> (tensor<f32>) = () {
     // CHECK: = tensor.from_elements %[[source:.+]] : tensor<1xi8>
@@ -31,16 +31,16 @@
     "test.sink"(%2) : (tensor<1xi8>) -> ()
     flow.return
   }
-  return %0 : tensor<f32>
+  util.return %0 : tensor<f32>
 }
 
 // -----
 
-func.func @tensor.from_elements_0D(%arg0 : f32) -> tensor<f32> {
+ util.func public @tensor.from_elements_0D(%arg0 : f32) -> tensor<f32> {
   %0 = tensor.from_elements %arg0 : tensor<f32>
-  return %0 : tensor<f32>
+  util.return %0 : tensor<f32>
 }
-//      CHECK: func.func @tensor.from_elements_0D
+//      CHECK:  util.func public @tensor.from_elements_0D
 // CHECK-SAME:     %[[ARG0:.+]]: f32
 //      CHECK:   %[[SPLAT:.+]] = flow.tensor.splat %[[ARG0]] : tensor<f32>
-//      CHECK:   return %[[SPLAT]]
+//      CHECK:   util.return %[[SPLAT]]

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/insert.mlir b/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/insert.mlir
index b63497c..a83fc1f 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/insert.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/insert.mlir

@@ -1,26 +1,26 @@
 // RUN: iree-opt --allow-unregistered-dialect --split-input-file --iree-flow-convert-to-flow %s | FileCheck %s
 
-func.func @insert_convert_zero_ranked_tensor
+ util.func public @insert_convert_zero_ranked_tensor
     (%arg0 : tensor<i64>) -> tensor<i64> {
   %c0_i64 = arith.constant 0 : i64
   %0 = tensor.insert %c0_i64 into %arg0[] : tensor<i64>
-  return %0 : tensor<i64>
+  util.return %0 : tensor<i64>
 }
-// CHECK-LABEL: func.func @insert_convert_zero_ranked_tensor
+// CHECK-LABEL:  util.func public @insert_convert_zero_ranked_tensor
 //  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]
 //   CHECK-DAG:   %[[C0_I64:.+]] = arith.constant 0 : i64
 //       CHECK:   %[[UPDATE:.+]] = flow.tensor.store %[[C0_I64]], %[[ARG0]] : tensor<i64>
 
 // -----
 
-func.func @insert_convert
+ util.func public @insert_convert
     (%arg0 : tensor<2x3xi64>) -> tensor<2x3xi64> {
   %c0 = arith.constant 0 : index
   %c0_i64 = arith.constant 0 : i64
   %0 = tensor.insert %c0_i64 into %arg0[%c0, %c0] : tensor<2x3xi64>
-  return %0 : tensor<2x3xi64>
+  util.return %0 : tensor<2x3xi64>
 }
-// CHECK-LABEL: func.func @insert_convert
+// CHECK-LABEL:  util.func public @insert_convert
 //  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]
 //   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 //   CHECK-DAG:   %[[C0_I64:.+]] = arith.constant 0 : i64
@@ -28,14 +28,14 @@
 
 // -----
 
-func.func @insert_convert_dynamic_dims
+ util.func public @insert_convert_dynamic_dims
     (%arg0 : tensor<?x3xi64>) -> tensor<?x3xi64> {
   %c0 = arith.constant 0 : index
   %c0_i64 = arith.constant 0 : i64
   %0 = tensor.insert %c0_i64 into %arg0[%c0, %c0] : tensor<?x3xi64>
-  return %0 : tensor<?x3xi64>
+  util.return %0 : tensor<?x3xi64>
 }
-// CHECK-LABEL: func.func @insert_convert_dynamic_dims
+// CHECK-LABEL:  util.func public @insert_convert_dynamic_dims
 //  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]
 //   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 //   CHECK-DAG:   %[[C0_I64:.+]] = arith.constant 0 : i64
@@ -44,7 +44,7 @@
 
 // -----
 
-func.func @insert_within_dispatch_workgroups_not_converted() -> tensor<f32> {
+ util.func public @insert_within_dispatch_workgroups_not_converted() -> tensor<f32> {
   %x = arith.constant 100 : index
   %0 = flow.dispatch.workgroups[%x]() : () -> (tensor<f32>) = () {
     %c0 = arith.constant 0 : index
@@ -55,5 +55,5 @@
     "test.sink"(%2) : (tensor<2x3xi64>) -> ()
     flow.return
   }
-  return %0 : tensor<f32>
+  util.return %0 : tensor<f32>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/insert_slice.mlir b/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/insert_slice.mlir
index 6f96112..662ab69 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/insert_slice.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/insert_slice.mlir

@@ -1,14 +1,14 @@
 // RUN: iree-opt --allow-unregistered-dialect --split-input-file --iree-flow-convert-to-flow %s | FileCheck %s
 
-func.func @insert_slice_convert
+ util.func public @insert_slice_convert
     (%arg0 : tensor<?x24x48xf32>, %arg1 : tensor<1x4x48xf32>) ->
     tensor<?x24x48xf32> {
   %c0 = arith.constant 0 : index
   %0 = tensor.insert_slice %arg1 into %arg0[4, 2, 0] [1, 4, 48] [1, 1, 1] :
       tensor<1x4x48xf32> into tensor<?x24x48xf32>
-  return %0 : tensor<?x24x48xf32>
+  util.return %0 : tensor<?x24x48xf32>
 }
-// CHECK-LABEL: func.func @insert_slice_convert
+// CHECK-LABEL:  util.func public @insert_slice_convert
 //  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]
 //  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]
 //   CHECK-DAG:   %[[C0:.+]] = arith.constant 0
@@ -20,15 +20,15 @@
 
 // -----
 
-func.func @insert_slice_convert_rank_reducing
+ util.func public @insert_slice_convert_rank_reducing
     (%arg0 : tensor<?x24x48xf32>, %arg1 : tensor<4x48xf32>) ->
     tensor<?x24x48xf32> {
   %c0 = arith.constant 0 : index
   %0 = tensor.insert_slice %arg1 into %arg0[4, 2, 0] [1, 4, 48] [1, 1, 1] :
       tensor<4x48xf32> into tensor<?x24x48xf32>
-  return %0 : tensor<?x24x48xf32>
+  util.return %0 : tensor<?x24x48xf32>
 }
-// CHECK-LABEL: func.func @insert_slice_convert_rank_reducing
+// CHECK-LABEL:  util.func public @insert_slice_convert_rank_reducing
 //  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]
 //  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]
 //   CHECK-DAG:   %[[C0:.+]] = arith.constant 0
@@ -41,12 +41,12 @@
 
 // -----
 
-func.func @rank_reducing_insert_slice_trailing_unit_dims
+ util.func public @rank_reducing_insert_slice_trailing_unit_dims
    (%arg0 : tensor<49x20xf32>, %arg1 : tensor<1x50x20x1xf32>) -> tensor<1x50x20x1xf32> {
   %0 = tensor.insert_slice %arg0 into %arg1[0, 1, 0, 0] [1, 49, 20, 1] [1, 1, 1, 1] : tensor<49x20xf32> into tensor<1x50x20x1xf32>
-  return %0 : tensor<1x50x20x1xf32>
+  util.return %0 : tensor<1x50x20x1xf32>
 }
-// CHECK-LABEL: func.func @rank_reducing_insert_slice_trailing_unit_dims
+// CHECK-LABEL:  util.func public @rank_reducing_insert_slice_trailing_unit_dims
 //   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 //   CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
 //       CHECK:   %[[RESHAPE:.+]] = flow.tensor.reshape %{{.+}} : tensor<49x20xf32> -> tensor<1x49x20x1xf32>
@@ -55,8 +55,8 @@
 
 // -----
 
-// CHECK-LABEL: func.func @insert_slice_within_dispatch_workgroups_not_converted
-func.func @insert_slice_within_dispatch_workgroups_not_converted() -> tensor<f32> {
+// CHECK-LABEL:  util.func public @insert_slice_within_dispatch_workgroups_not_converted
+ util.func public @insert_slice_within_dispatch_workgroups_not_converted() -> tensor<f32> {
   %x = arith.constant 100 : index
   %0 = flow.dispatch.workgroups[%x]() : () -> (tensor<f32>) = () {
     // CHECK: = tensor.insert_slice %[[source2:.+]] into %[[source1:.+]][4, 2, 0] [1, 4, 48] [1, 1, 1] : tensor<1x4x48xf32> into tensor<?x24x48xf32>
@@ -67,19 +67,19 @@
     "test.sink"(%3) : (tensor<?x24x48xf32>) -> ()
     flow.return
   }
-  return %0 : tensor<f32>
+  util.return %0 : tensor<f32>
 }
 
 // -----
 
-func.func @insert_slice_convert_dynamic_offset_and_size
+ util.func public @insert_slice_convert_dynamic_offset_and_size
     (%target: tensor<?x24x48xf32>, %slice: tensor<1x?x48xf32>, %offset: index, %size: index) ->
     tensor<?x24x48xf32> {
   %0 = tensor.insert_slice %slice into %target[%offset, 2, 0] [1, %size, 48] [1, 1, 1] :
       tensor<1x?x48xf32> into tensor<?x24x48xf32>
-  return %0 : tensor<?x24x48xf32>
+  util.return %0 : tensor<?x24x48xf32>
 }
-// CHECK-LABEL: func.func @insert_slice_convert_dynamic_offset_and_size
+// CHECK-LABEL:  util.func public @insert_slice_convert_dynamic_offset_and_size
 //  CHECK-SAME:   %[[TARGET:[a-zA-Z0-9_]+]]
 //  CHECK-SAME:   %[[SLICE:[a-zA-Z0-9_]+]]
 //  CHECK-SAME:   %[[OFFSET:[a-zA-Z0-9_]+]]
@@ -92,8 +92,8 @@
 
 // -----
 
-// CHECK-LABEL: func.func @insert_slice_dynamic_tensor_result_not_converted
-func.func @insert_slice_dynamic_tensor_result_not_converted
+// CHECK-LABEL:  util.func public @insert_slice_dynamic_tensor_result_not_converted
+ util.func public @insert_slice_dynamic_tensor_result_not_converted
     (%arg0: tensor<?x24x48xf32>, %arg1: tensor<1x4x48xf32>, %offset: index) ->
     tensor<?x24x48xf32> {
   %x = arith.constant 100 : index
@@ -106,6 +106,6 @@
   // CHECK: %[[INSERTED_TENSOR:.+]] = tensor.insert_slice %{{.*}} into %{{.*}}[%{{.*}}, 2, 0] [1, 4, 48] [1, 1, 1]
   %2 = tensor.insert_slice %arg1 into %arg0[%idx, 2, 0] [1, 4, 48] [1, 1, 1] :
       tensor<1x4x48xf32> into tensor<?x24x48xf32>
-  // CHECK: return %[[INSERTED_TENSOR]] : tensor<?x24x48xf32>
-  return %2 : tensor<?x24x48xf32>
+  // CHECK: util.return %[[INSERTED_TENSOR]] : tensor<?x24x48xf32>
+  util.return %2 : tensor<?x24x48xf32>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/reshape.mlir b/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/reshape.mlir
index cfc50ab..c85bd56 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/reshape.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow/test/reshape.mlir

@@ -1,6 +1,6 @@
 // RUN: iree-opt --iree-flow-convert-to-flow --split-input-file %s | FileCheck %s
 
-func.func @turn_fill_into_splat(%arg0: tensor<?x?xf32>, %arg1: tensor<f32>, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> tensor<?x?xf32> {
+ util.func public @turn_fill_into_splat(%arg0: tensor<?x?xf32>, %arg1: tensor<f32>, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> tensor<?x?xf32> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %0 = tensor.extract %arg1[] : tensor<f32>
@@ -11,11 +11,11 @@
   %5 = tensor.empty(%3, %4) : tensor<?x?xf32>
   %6 = linalg.fill ins(%0 : f32) outs(%5 : tensor<?x?xf32>) -> tensor<?x?xf32>
   %7 = flow.tensor.update %arg0, %6[%arg2, %arg3] : tensor<?x?xf32>{%1, %2} -> %6 as tensor<?x?xf32>{%3, %4}
-  return %7 : tensor<?x?xf32>
+  util.return %7 : tensor<?x?xf32>
 }
 
 //       CHECK: #[[MAP:.+]] = affine_map<(d0)[s0, s1] -> (d0 + s0 + s1)>
-//       CHECK: func.func @turn_fill_into_splat
+//       CHECK:  util.func public @turn_fill_into_splat
 //  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
 //  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9]+]]: tensor<f32>
 //  CHECK-SAME:   %[[ARG2:[a-zA-Z0-9]+]]: index
@@ -34,17 +34,17 @@
 
 // -----
 
-func.func @static_tensor_reshape(%arg0: tensor<2x4xf32>, %arg1: tensor<2xindex>) -> tensor<1x8xf32> {
+ util.func public @static_tensor_reshape(%arg0: tensor<2x4xf32>, %arg1: tensor<2xindex>) -> tensor<1x8xf32> {
   // CHECK-DAG: %[[RESULT:.*]] = flow.tensor.reshape %arg0 : tensor<2x4xf32> -> tensor<1x8xf32>
-  // CHECK: return %[[RESULT]]
+  // CHECK: util.return %[[RESULT]]
   %0 = tensor.reshape %arg0(%arg1)
              : (tensor<2x4xf32>, tensor<2xindex>) -> tensor<1x8xf32>
-  return %0 : tensor<1x8xf32> }
+  util.return %0 : tensor<1x8xf32> }
 
 // -----
 
-  func.func @dynamic_tensor_reshape(%arg0: tensor<2x4xf32>, %arg1: tensor<2xindex>) -> tensor<?x?xf32> {
-  //      CHECK: func.func @dynamic_tensor_reshape
+   util.func public @dynamic_tensor_reshape(%arg0: tensor<2x4xf32>, %arg1: tensor<2xindex>) -> tensor<?x?xf32> {
+  //      CHECK:  util.func public @dynamic_tensor_reshape
   // CHECK-SAME:     %[[ARG0:.+]]: tensor<2x4xf32>
   // CHECK-SAME:     %[[ARG1:.+]]: tensor<2xindex>
   // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
@@ -52,29 +52,29 @@
   // CHECK-DAG: %[[VAL:.+]] = flow.tensor.load %[[ARG1]][%[[C0]]] : tensor<2xindex>
   // CHECK-DAG: %[[VAL1:.+]] = flow.tensor.load %[[ARG1]][%[[C1]]] : tensor<2xindex>
   // CHECK-DAG: %[[RESULT:.*]] = flow.tensor.reshape %[[ARG0]] : tensor<2x4xf32> -> tensor<?x?xf32>{%[[VAL]], %[[VAL1]]}
-  // CHECK: return %[[RESULT]]
+  // CHECK: util.return %[[RESULT]]
   %0 = tensor.reshape %arg0(%arg1)
              : (tensor<2x4xf32>, tensor<2xindex>) -> tensor<?x?xf32>
-  return %0 : tensor<?x?xf32> }
+  util.return %0 : tensor<?x?xf32> }
 
   // -----
 
-  func.func @mix_dynamic_and_static_tensor_reshape(%arg0: tensor<2x4xf32>, %arg1: tensor<2xindex>) -> tensor<1x?xf32> {
-  //      CHECK: func.func @mix_dynamic_and_static_tensor_reshape
+   util.func public @mix_dynamic_and_static_tensor_reshape(%arg0: tensor<2x4xf32>, %arg1: tensor<2xindex>) -> tensor<1x?xf32> {
+  //      CHECK:  util.func public @mix_dynamic_and_static_tensor_reshape
   // CHECK-SAME:     %[[ARG0:.+]]: tensor<2x4xf32>
   // CHECK-SAME:     %[[ARG1:.+]]: tensor<2xindex>
   // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
   // CHECK-DAG: %[[VAL:.+]] = flow.tensor.load %[[ARG1]][%[[C1]]] : tensor<2xindex>
   // CHECK-DAG: %[[RESULT:.*]] = flow.tensor.reshape %[[ARG0]] : tensor<2x4xf32> -> tensor<1x?xf32>{%[[VAL]]}
-  // CHECK: return %[[RESULT]]
+  // CHECK: util.return %[[RESULT]]
   %0 = tensor.reshape %arg0(%arg1)
              : (tensor<2x4xf32>, tensor<2xindex>) -> tensor<1x?xf32>
-  return %0 : tensor<1x?xf32> }
+  util.return %0 : tensor<1x?xf32> }
 
   // -----
 
-  func.func @dynamic_input_and_output_tensor_reshape(%arg0: tensor<?x4xf32>, %arg1: tensor<2xindex>) -> tensor<1x?xf32> {
-  //      CHECK: func.func @dynamic_input_and_output_tensor_reshape
+   util.func public @dynamic_input_and_output_tensor_reshape(%arg0: tensor<?x4xf32>, %arg1: tensor<2xindex>) -> tensor<1x?xf32> {
+  //      CHECK:  util.func public @dynamic_input_and_output_tensor_reshape
   // CHECK-SAME:     %[[ARG0:.+]]: tensor<?x4xf32>
   // CHECK-SAME:     %[[ARG1:.+]]: tensor<2xindex>
   // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
@@ -82,18 +82,18 @@
   // CHECK-DAG: %[[D0:.*]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?x4xf32>
   // CHECK-DAG: %[[VAL:.+]] = flow.tensor.load %[[ARG1]][%[[C1]]] : tensor<2xindex>
   // CHECK-DAG: %[[RESULT:.*]] = flow.tensor.reshape %[[ARG0]] : tensor<?x4xf32>{%[[D0]]} -> tensor<1x?xf32>{%[[VAL]]}
-  // CHECK: return %[[RESULT]]
+  // CHECK: util.return %[[RESULT]]
   %0 = tensor.reshape %arg0(%arg1)
              : (tensor<?x4xf32>, tensor<2xindex>) -> tensor<1x?xf32>
-  return %0 : tensor<1x?xf32> }
+  util.return %0 : tensor<1x?xf32> }
 
   // -----
-  func.func @from_elements_test_reshape(%arg0: tensor<?x4xf32>, %arg1: index, %arg2: index) -> tensor<?x1xf32> {
+   util.func public @from_elements_test_reshape(%arg0: tensor<?x4xf32>, %arg1: index, %arg2: index) -> tensor<?x1xf32> {
   // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
   // CHECK-DAG: %[[D1:.*]] = tensor.dim %arg0, %[[C0:.*]] : tensor<?x4xf32>
   // CHECK-DAG: %[[RESULT:.*]] = flow.tensor.reshape %arg0 : tensor<?x4xf32>{%[[D1]]} -> tensor<?x1xf32>{%arg1}
-  // CHECK: return %[[RESULT]]
+  // CHECK: util.return %[[RESULT]]
   %0 = tensor.from_elements %arg1, %arg2 : tensor<2xindex>
   %1 = tensor.reshape %arg0(%0)
              : (tensor<?x4xf32>, tensor<2xindex>) -> tensor<?x1xf32>
-  return %1 : tensor<?x1xf32> }
+  util.return %1 : tensor<?x1xf32> }

diff --git a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp
index 6266615..6ad3b9c 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp
+++ b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp

@@ -1510,8 +1510,10 @@
   state.addAttribute("function_type", TypeAttr::get(type));
   state.attributes.append(attrs.begin(), attrs.end());
   state.attributes.erase(IREE::Util::TiedOpInterface::getStorageAttrName());
-  state.addAttribute(IREE::Util::TiedOpInterface::getStorageAttrName(),
-                     tiedOperands);
+  if (tiedOperands) {
+    state.addAttribute(IREE::Util::TiedOpInterface::getStorageAttrName(),
+                       tiedOperands);
+  }
   state.addRegion();
   if (!argAttrs.empty() || !resAttrs.empty()) {
     assert(type.getNumInputs() == argAttrs.size());
@@ -1538,8 +1540,10 @@
   state.addOperands(resultDims);
   state.addAttributes(attributes);
   state.attributes.erase(IREE::Util::TiedOpInterface::getStorageAttrName());
-  state.addAttribute(IREE::Util::TiedOpInterface::getStorageAttrName(),
-                     tiedOperands);
+  if (tiedOperands) {
+    state.addAttribute(IREE::Util::TiedOpInterface::getStorageAttrName(),
+                       tiedOperands);
+  }
   state.attributes.erase(getOperandSegmentSizeAttr());
   state.addAttribute(getOperandSegmentSizeAttr(),
                      builder.getDenseI32ArrayAttr({

diff --git a/compiler/src/iree/compiler/Dialect/Flow/IR/test/call_ops.mlir b/compiler/src/iree/compiler/Dialect/Flow/IR/test/call_ops.mlir
index 3bc7761..8f94447 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/IR/test/call_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/IR/test/call_ops.mlir

@@ -41,14 +41,14 @@
 
 // CHECK-LABEL: @basicCall
 // CHECK-SAME: (%[[ARG0:.+]]: tensor<?xf32>)
-func.func @basicCall(%arg0: tensor<?xf32>) -> (tensor<?xf32>, i32) {
+util.func public @basicCall(%arg0: tensor<?xf32>) -> (tensor<?xf32>, i32) {
   %c0 = arith.constant 0 : index
   // CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %c0
   %dim = tensor.dim %arg0, %c0 : tensor<?xf32>
   // CHECK: %[[CALL:.+]]:2 = flow.call @basicExtern(%[[ARG0]], %[[DIM]]) : (tensor<?xf32>{%[[DIM]]}, index) -> (tensor<?xf32>{%[[DIM]]}, i32)
   %call:2 = flow.call @basicExtern(%arg0, %dim) : (tensor<?xf32>{%dim}, index) -> (tensor<?xf32>{%dim}, i32)
-  // CHECK: return %[[CALL]]#0, %[[CALL]]#1
-  return %call#0, %call#1 : tensor<?xf32>, i32
+  // CHECK: util.return %[[CALL]]#0, %[[CALL]]#1
+  util.return %call#0, %call#1 : tensor<?xf32>, i32
 }
 
 // -----
@@ -59,14 +59,14 @@
 
 // CHECK-LABEL: @inplaceCall
 // CHECK-SAME: (%[[ARG0:.+]]: tensor<?xf32>)
-func.func @inplaceCall(%arg0: tensor<?xf32>) -> tensor<?xf32> {
+util.func public @inplaceCall(%arg0: tensor<?xf32>) -> tensor<?xf32> {
   %c0 = arith.constant 0 : index
   // CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %c0
   %dim = tensor.dim %arg0, %c0 : tensor<?xf32>
   // CHECK: %[[CALL:.+]] = flow.call @inplaceExtern(%[[ARG0]], %[[DIM]]) : (tensor<?xf32>{%[[DIM]]}, index) -> %[[ARG0]]{%[[DIM]]}
   %call = flow.call @inplaceExtern(%arg0, %dim) : (tensor<?xf32>{%dim}, index) -> %arg0{%dim}
-  // CHECK: return %[[CALL]]
-  return %call : tensor<?xf32>
+  // CHECK: util.return %[[CALL]]
+  util.return %call : tensor<?xf32>
 }
 
 // -----
@@ -77,12 +77,12 @@
 
 // CHECK-LABEL: @inplaceTypeChangeCall
 // CHECK-SAME: (%[[ARG0:.+]]: tensor<?x4xf32>)
-func.func @inplaceTypeChangeCall(%arg0: tensor<?x4xf32>) -> tensor<4x?xi32> {
+util.func public @inplaceTypeChangeCall(%arg0: tensor<?x4xf32>) -> tensor<4x?xi32> {
   %c0 = arith.constant 0 : index
   // CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %c0
   %dim = tensor.dim %arg0, %c0 : tensor<?x4xf32>
   // CHECK: %[[CALL:.+]] = flow.call @inplaceTypeChangeExtern(%[[ARG0]], %[[DIM]]) : (tensor<?x4xf32>{%[[DIM]]}, index) -> %[[ARG0]] as tensor<4x?xi32>{%[[DIM]]}
   %call = flow.call @inplaceTypeChangeExtern(%arg0, %dim) : (tensor<?x4xf32>{%dim}, index) -> %arg0 as tensor<4x?xi32>{%dim}
-  // CHECK: return %[[CALL]]
-  return %call : tensor<4x?xi32>
+  // CHECK: util.return %[[CALL]]
+  util.return %call : tensor<4x?xi32>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Flow/IR/test/dispatch_ops.mlir b/compiler/src/iree/compiler/Dialect/Flow/IR/test/dispatch_ops.mlir
index 7d0138e..1c69b13 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/IR/test/dispatch_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/IR/test/dispatch_ops.mlir

@@ -3,19 +3,19 @@
 flow.executable @ex0 {
   flow.executable.export @dispatch_fn
   builtin.module {
-    func.func @dispatch_fn(%cst : index, %arg0 : tensor<4xf32>) -> tensor<4xf32> {
-      return %arg0 : tensor<4xf32>
+    util.func public @dispatch_fn(%cst : index, %arg0 : tensor<4xf32>) -> tensor<4xf32> {
+      util.return %arg0 : tensor<4xf32>
     }
   }
 }
 
 // CHECK-LABEL: @dispatch
-func.func @dispatch(%arg0 : tensor<4xf32>) -> tensor<4xf32> {
+util.func public @dispatch(%arg0 : tensor<4xf32>) -> tensor<4xf32> {
   // CHECK: %[[CST:.+]] = arith.constant
   %cst = arith.constant 4 : index
   // CHECK: %0 = flow.dispatch @ex0::@dispatch_fn[%[[CST]]](%[[CST]], %arg0) : (index, tensor<4xf32>) -> tensor<4xf32>
   %0 = flow.dispatch @ex0::@dispatch_fn[%cst](%cst, %arg0) : (index, tensor<4xf32>) -> tensor<4xf32>
-  return %0 : tensor<4xf32>
+  util.return %0 : tensor<4xf32>
 }
 
 // -----
@@ -26,10 +26,10 @@
 }
 
 // CHECK-LABEL: @dispatchWithMultipleRefs
-func.func @dispatchWithMultipleRefs(%arg0: tensor<4xf32>) -> tensor<4xf32> {
+util.func public @dispatchWithMultipleRefs(%arg0: tensor<4xf32>) -> tensor<4xf32> {
   // CHECK: = flow.dispatch {@ex0::@dispatch_a, @ex0::@dispatch_b}(%arg0) : (tensor<4xf32>) -> tensor<4xf32>
   %0 = flow.dispatch {@ex0::@dispatch_a, @ex0::@dispatch_b}(%arg0) : (tensor<4xf32>) -> tensor<4xf32>
-  return %0 : tensor<4xf32>
+  util.return %0 : tensor<4xf32>
 }
 
 
@@ -42,12 +42,12 @@
 }
 
 // CHECK-LABEL: @dispatchWithWorkgroupCount
-func.func @dispatchWithWorkgroupCount(%arg0: tensor<4xf32>, %arg1: index) -> tensor<4xf32> {
+util.func public @dispatchWithWorkgroupCount(%arg0: tensor<4xf32>, %arg1: index) -> tensor<4xf32> {
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
   // CHECK: = flow.dispatch @ex0::@dispatch[%c1, %c2](%arg0, %arg1) : (tensor<4xf32>, index) -> tensor<4xf32>
   %0 = flow.dispatch @ex0::@dispatch[%c1, %c2](%arg0, %arg1) : (tensor<4xf32>, index) -> tensor<4xf32>
-  return %0 : tensor<4xf32>
+  util.return %0 : tensor<4xf32>
 }
 
 // -----
@@ -58,40 +58,40 @@
   }
 }
 
-func.func @dispatchWithInvalidWorkload(%arg0: tensor<4xf32>, %arg1: index) -> tensor<4xf32> {
+util.func public @dispatchWithInvalidWorkload(%arg0: tensor<4xf32>, %arg1: index) -> tensor<4xf32> {
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
   // expected-error @+1 {{op workload mismatch; entry point expects 1 arguments but dispatch provides 2}}
   %0 = flow.dispatch @ex0::@dispatch[%c1, %c2](%arg0, %arg1) : (tensor<4xf32>, index) -> tensor<4xf32>
-  return %0 : tensor<4xf32>
+  util.return %0 : tensor<4xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @dispatchNoWorkload
-func.func @dispatchNoWorkload(%arg0 : tensor<4xf32>) -> tensor<4xf32> {
+util.func public @dispatchNoWorkload(%arg0 : tensor<4xf32>) -> tensor<4xf32> {
   // CHECK: %[[CST:.+]] = arith.constant
   %cst = arith.constant 4 : index
   // CHECK: %0 = flow.dispatch @ex0::@dispatch_fn(%[[CST]], %arg0) : (index, tensor<4xf32>) -> tensor<4xf32>
   %0 = flow.dispatch @ex0::@dispatch_fn(%cst, %arg0) : (index, tensor<4xf32>) -> tensor<4xf32>
-  return %0 : tensor<4xf32>
+  util.return %0 : tensor<4xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @inplaceDispatch
-func.func @inplaceDispatch(%arg0 : tensor<4xf32>, %arg1 : tensor<8xf32>) -> (tensor<4xf32>, tensor<8xf32>) {
+util.func public @inplaceDispatch(%arg0 : tensor<4xf32>, %arg1 : tensor<8xf32>) -> (tensor<4xf32>, tensor<8xf32>) {
   // CHECK: %[[CST:.+]] = arith.constant
   %cst = arith.constant 4 : index
   // CHECK: %0:2 = flow.dispatch @ex0::@dispatch_fn[%[[CST]]](%[[CST]], %arg0, %arg1) : (index, tensor<4xf32>, tensor<8xf32>) -> (%arg0, %arg1)
   %0, %1 = flow.dispatch @ex0::@dispatch_fn[%cst](%cst, %arg0, %arg1) : (index, tensor<4xf32>, tensor<8xf32>) -> (%arg0, %arg1)
-  return %0, %1 : tensor<4xf32>, tensor<8xf32>
+  util.return %0, %1 : tensor<4xf32>, tensor<8xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @inplaceDynamicDispatch
-func.func @inplaceDynamicDispatch(%arg0 : tensor<4x?xf32>, %arg1 : tensor<8x?xf32>) -> (tensor<4x?xf32>, tensor<8x?xf32>) {
+util.func public @inplaceDynamicDispatch(%arg0 : tensor<4x?xf32>, %arg1 : tensor<8x?xf32>) -> (tensor<4x?xf32>, tensor<8x?xf32>) {
   // CHECK-DAG: %[[CST:.+]] = arith.constant 4
   %cst = arith.constant 4 : index
   // CHECK-DAG: %[[DIM0:.+]] = arith.constant 100
@@ -100,28 +100,28 @@
   %dim1 = arith.constant 200 : index
   // CHECK: %0:2 = flow.dispatch @ex0::@dispatch_fn[%[[CST]]](%[[CST]], %arg0, %arg1) : (index, tensor<4x?xf32>{%[[DIM0]]}, tensor<8x?xf32>{%[[DIM1]]}) -> (%arg0{%[[DIM1]]}, %arg1{%[[DIM0]]})
   %0, %1 = flow.dispatch @ex0::@dispatch_fn[%cst](%cst, %arg0, %arg1) : (index, tensor<4x?xf32>{%dim0}, tensor<8x?xf32>{%dim1}) -> (%arg0{%dim1}, %arg1{%dim0})
-  return %0, %1 : tensor<4x?xf32>, tensor<8x?xf32>
+  util.return %0, %1 : tensor<4x?xf32>, tensor<8x?xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @inplaceTypeChange
 // CHECK-SAME: (%[[ARG0:.+]]: tensor<4x?xf32>)
-func.func @inplaceTypeChange(%arg0: tensor<4x?xf32>) -> tensor<?x4xf32> {
+util.func public @inplaceTypeChange(%arg0: tensor<4x?xf32>) -> tensor<?x4xf32> {
   // CHECK-DAG: %[[CST:.+]] = arith.constant 4
   %cst = arith.constant 4 : index
   // CHECK-DAG: %[[DIM0:.+]] = arith.constant 100
   %dim0 = arith.constant 100 : index
   // CHECK: %0 = flow.dispatch @ex0::@dispatch_fn[%[[CST]]](%[[ARG0]]) : (tensor<4x?xf32>{%[[DIM0]]}) -> %arg0 as tensor<?x4xf32>{%[[DIM0]]}
   %0 = flow.dispatch @ex0::@dispatch_fn[%cst](%arg0) : (tensor<4x?xf32>{%dim0}) -> %arg0 as tensor<?x4xf32>{%dim0}
-  return %0 : tensor<?x4xf32>
+  util.return %0 : tensor<?x4xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @region
 // CHECK-SAME: (%[[ARG0:.+]]: tensor<?x?xf32>)
-func.func @region(%arg0: tensor<?x?xf32>) -> tensor<?x?xf32> {
+util.func public @region(%arg0: tensor<?x?xf32>) -> tensor<?x?xf32> {
   // CHECK: %[[R:.*]] = flow.dispatch.region -> (tensor<?x?xf32>{%{{.*}}, %{{.*}}}) {
   // CHECK:   flow.return %[[ARG0]] : tensor<?x?xf32>
   // CHECK: }
@@ -132,30 +132,30 @@
   %r = flow.dispatch.region -> (tensor<?x?xf32>{%d0, %d1}) {
     flow.return %arg0 : tensor<?x?xf32>
   }
-  // CHECK: return %[[R]]
-  return %r : tensor<?x?xf32>
+  // CHECK: util.return %[[R]]
+  util.return %r : tensor<?x?xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @regionStaticShape
 // CHECK-SAME: (%[[ARG0:.+]]: tensor<5x10xf32>)
-func.func @regionStaticShape(%arg0: tensor<5x10xf32>) -> tensor<5x10xf32> {
+util.func public @regionStaticShape(%arg0: tensor<5x10xf32>) -> tensor<5x10xf32> {
   // CHECK: %[[R:.*]] = flow.dispatch.region -> (tensor<5x10xf32>) {
   // CHECK:   flow.return %[[ARG0]] : tensor<5x10xf32>
   // CHECK: }
   %r = flow.dispatch.region -> (tensor<5x10xf32>) {
     flow.return %arg0 : tensor<5x10xf32>
   }
-  // CHECK: return %[[R]]
-  return %r : tensor<5x10xf32>
+  // CHECK: util.return %[[R]]
+  util.return %r : tensor<5x10xf32>
 }
 
 // -----
 
-// CHECK-LABEL: func.func @regionDynamicShape
+// CHECK-LABEL: util.func public @regionDynamicShape
 // CHECK-SAME: (%[[ARG0:.+]]: tensor<?x?x16xf32>, %[[DIM0:.+]]: index, %[[DIM1:.+]]: index, %[[DIM2:.+]]: index, %[[DIM3:.+]]: index)
-func.func @regionDynamicShape(%arg0: tensor<?x?x16xf32>, %dim0: index, %dim1: index, %dim2: index, %dim3: index) -> tensor<?x?x16xf32> {
+util.func public @regionDynamicShape(%arg0: tensor<?x?x16xf32>, %dim0: index, %dim1: index, %dim2: index, %dim3: index) -> tensor<?x?x16xf32> {
   // CHECK: %[[C16:.+]] = arith.constant 16 : index
   %c16 = arith.constant 16 : index
   // CHECK: %[[R:.+]] = flow.dispatch.region[%[[DIM0]], %[[DIM1]], %[[C16]]] -> (tensor<?x?x16xf32>{%[[DIM2]], %[[DIM3]]}) {
@@ -164,6 +164,6 @@
   %region = flow.dispatch.region[%dim0, %dim1, %c16] -> (tensor<?x?x16xf32>{%dim2, %dim3}) {
     flow.return %arg0 : tensor<?x?x16xf32>
   }
-  // CHECK: return %[[R]]
-  return %region: tensor<?x?x16xf32>
+  // CHECK: util.return %[[R]]
+  util.return %region: tensor<?x?x16xf32>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Flow/IR/test/dispatch_tensor_folding.mlir b/compiler/src/iree/compiler/Dialect/Flow/IR/test/dispatch_tensor_folding.mlir
index 9d8a4f4..2927f3f 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/IR/test/dispatch_tensor_folding.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/IR/test/dispatch_tensor_folding.mlir

@@ -1,24 +1,24 @@
 // RUN: iree-opt --allow-unregistered-dialect --split-input-file --canonicalize %s | FileCheck %s
 
 // CHECK-LABEL: @ReuseDispatchTensorLoadShapeDims
-func.func @ReuseDispatchTensorLoadShapeDims(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index) {
+util.func public @ReuseDispatchTensorLoadShapeDims(%arg0: !flow.dispatch.tensor<readonly:tensor<?x?xf32>>, %arg1: index, %arg2: index, %arg3: index, %arg4: index) {
   %arg0_tied = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%arg1, %arg2}
   %c0 = arith.constant 0 : index
   // CHECK: flow.dispatch.tensor.load {{.+}} !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%arg1, %arg2}
   %0 = flow.dispatch.tensor.load %arg0_tied, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%arg3, %arg4} -> tensor<256x1024xf32>
   "test.sink"(%0) : (tensor<256x1024xf32>) -> ()
-  return
+  util.return
 }
 
 // -----
 
-func.func @canonicalizeStaticOperands(%arg0: !flow.dispatch.tensor<readonly:tensor<4x4xf32>>) {
+util.func public @canonicalizeStaticOperands(%arg0: !flow.dispatch.tensor<readonly:tensor<4x4xf32>>) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
   %0 = flow.dispatch.tensor.load %arg0, offsets=[%c0, %c0], sizes=[%c2, %c2], strides=[%c1, %c1] : !flow.dispatch.tensor<readonly:tensor<4x4xf32>> -> tensor<?x?xf32>
   "test.sink"(%0) : (tensor<?x?xf32>) -> ()
-  return
+  util.return
 }
 
 // CHECK: @canonicalizeStaticOperand
@@ -31,13 +31,13 @@
 
 // -----
 
-func.func @canonicalizePartiallyStaticOperands(%arg0: !flow.dispatch.tensor<readonly:tensor<4x4xf32>>, %offset: index, %size: index, %stride: index) {
+util.func public @canonicalizePartiallyStaticOperands(%arg0: !flow.dispatch.tensor<readonly:tensor<4x4xf32>>, %offset: index, %size: index, %stride: index) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
   %0 = flow.dispatch.tensor.load %arg0, offsets=[%offset, %c0], sizes=[%size, %c2], strides=[%stride, %c1] : !flow.dispatch.tensor<readonly:tensor<4x4xf32>> -> tensor<?x?xf32>
   "test.sink"(%0) : (tensor<?x?xf32>) -> ()
-  return
+  util.return
 }
 
 // CHECK: @canonicalizePartiallyStaticOperands
@@ -51,7 +51,7 @@
 
 // -----
 
-func.func @canonicalizeDispatchLoad(%arg0: !flow.dispatch.tensor<readonly:tensor<3x4x1x12x64xf32>>, %arg1 : index, %arg2: index, %arg3 : index) {
+util.func public @canonicalizeDispatchLoad(%arg0: !flow.dispatch.tensor<readonly:tensor<3x4x1x12x64xf32>>, %arg1 : index, %arg2: index, %arg3 : index) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %0 = flow.dispatch.tensor.load %arg0, offsets = [%arg1, %c0, 0, %arg2, %arg3], sizes = [1, 4, 1, 4, 32], strides = [%c1, %c1, 1, %c1, %c1] : !flow.dispatch.tensor<readonly:tensor<3x4x1x12x64xf32>> -> tensor<1x4x?x32xf32>
@@ -67,13 +67,13 @@
 
 // -----
 
-func.func @canonicalizeDimOfTensorTile(%arg0: !flow.dispatch.tensor<readonly:tensor<250x1024xf32>>, %arg1 : index, %arg2: index) {
+util.func public @canonicalizeDimOfTensorTile(%arg0: !flow.dispatch.tensor<readonly:tensor<250x1024xf32>>, %arg1 : index, %arg2: index) {
   %c0 = arith.constant 0 : index
   %0 = affine.min affine_map<(d0) -> (64, -d0 + 250)>(%arg1)
   %1 = flow.dispatch.tensor.load %arg0, offsets = [%arg2, 0], sizes = [%0, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<250x1024xf32>> -> tensor<?x1024xf32>
   %2 = tensor.dim %1, %c0 : tensor<?x1024xf32>
   "test.sink"(%2) : (index) -> ()
-  return
+  util.return
 }
 
 // CHECK: #[[MAP:.+]] = affine_map<()[s0] -> (-s0 + 250, 64)>
@@ -84,15 +84,15 @@
 
 // -----
 
-func.func @foldCastIntoStore(%arg0: !flow.dispatch.tensor<writeonly:tensor<?x?x?xf32>>,
+util.func public @foldCastIntoStore(%arg0: !flow.dispatch.tensor<writeonly:tensor<?x?x?xf32>>,
     %arg1 : tensor<3x?xf32>, %arg2 : index, %arg3 : index, %arg4 : index, %arg5 : index) {
   %c3 = arith.constant 3 : index
   %0 = tensor.cast %arg1 : tensor<3x?xf32> to tensor<?x?xf32>
   flow.dispatch.tensor.store %0, %arg0, offsets = [3, 4, 5], sizes = [%c3, 1, %arg2], strides = [1, 1, 1]
       : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?xf32>>{%arg3, %arg4, %arg5}
-  return
+  util.return
 }
-//      CHECK: func @foldCastIntoStore
+//      CHECK: util.func public @foldCastIntoStore
 // CHECK-SAME:     %[[ARG0:.+]]: !flow.dispatch.tensor<writeonly:tensor<?x?x?xf32>>
 // CHECK-SAME:     %[[ARG1:.+]]: tensor<3x?xf32>
 // CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: index

diff --git a/compiler/src/iree/compiler/Dialect/Flow/IR/test/dispatch_workgroups.mlir b/compiler/src/iree/compiler/Dialect/Flow/IR/test/dispatch_workgroups.mlir
index 35bafd1..2ff07fe 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/IR/test/dispatch_workgroups.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/IR/test/dispatch_workgroups.mlir

@@ -1,7 +1,7 @@
 // RUN: iree-opt --allow-unregistered-dialect --split-input-file %s | iree-opt --allow-unregistered-dialect --split-input-file | FileCheck %s
 
 // CHECK-LABEL: @complexWorkgroupsUsage
-func.func @complexWorkgroupsUsage(
+util.func public @complexWorkgroupsUsage(
     // CHECK-SAME: %[[ARG0:.+]]: tensor<?x4xf32>
     %arg0 : tensor<?x4xf32>,
     // CHECK-SAME: %[[ARG1:.+]]: index
@@ -56,14 +56,14 @@
     // CHECK-NEXT: flow.return
     flow.return
   }
-  // CHECK: return %[[OUTER_RET0]] : tensor<4x?xf32>
-  return %0 : tensor<4x?xf32>
+  // CHECK: util.return %[[OUTER_RET0]] : tensor<4x?xf32>
+  util.return %0 : tensor<4x?xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @inplaceDispatch
-func.func @inplaceDispatch(
+util.func public @inplaceDispatch(
     // CHECK-SAME: %[[ARG0:.+]]: tensor<?x4xf32>
     %arg0: tensor<?x4xf32>,
     // CHECK-SAME: %[[ARG1:.+]]: index
@@ -88,15 +88,15 @@
     // CHECK-NEXT: flow.return
     flow.return
   }
-  // CHECK: return %[[OUTER_RET0]] : tensor<?x4xf32>
-  return %0 : tensor<?x4xf32>
+  // CHECK: util.return %[[OUTER_RET0]] : tensor<?x4xf32>
+  util.return %0 : tensor<?x4xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @dispatchWithCountRegion
 // CHECK-SAME: (%[[ARG0:.+]]: tensor<4xi32>)
-func.func @dispatchWithCountRegion(%arg0: tensor<4xi32>) -> tensor<4xi32> {
+util.func public @dispatchWithCountRegion(%arg0: tensor<4xi32>) -> tensor<4xi32> {
   // CHECK-DAG: %[[WORKGROUP_COUNT_X:.+]] = arith.constant 100
   %x = arith.constant 100 : index
   // CHECK-DAG: %[[WORKGROUP_COUNT_Y:.+]] = arith.constant 50
@@ -117,6 +117,6 @@
     // CHECK-NEXT: flow.return %[[X_CAPTURE]], %[[Y_CAPTURE]], %[[Z]]
     flow.return %x_capture, %y_capture, %z : index, index, index
   }
-  // CHECK: return %[[OUTER_RET0]] : tensor<4xi32>
-  return %0 : tensor<4xi32>
+  // CHECK: util.return %[[OUTER_RET0]] : tensor<4xi32>
+  util.return %0 : tensor<4xi32>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Flow/IR/test/dispatch_workgroups_folding.mlir b/compiler/src/iree/compiler/Dialect/Flow/IR/test/dispatch_workgroups_folding.mlir
index 3ec11fb..6b1c799 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/IR/test/dispatch_workgroups_folding.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/IR/test/dispatch_workgroups_folding.mlir

@@ -1,8 +1,8 @@
 // RUN: iree-opt --allow-unregistered-dialect --split-input-file --canonicalize --cse %s | iree-opt --allow-unregistered-dialect --split-input-file | FileCheck %s
 
-// CHECK-LABEL: func.func @dontInlineReadWrite
+// CHECK-LABEL: util.func public @dontInlineReadWrite
 // CHECK-SAME: (%[[ARG0:.+]]: tensor<1x4xf32>)
-func.func @dontInlineReadWrite(%arg0: tensor<1x4xf32>) -> tensor<4x8xf32> {
+util.func public @dontInlineReadWrite(%arg0: tensor<1x4xf32>) -> tensor<4x8xf32> {
   // CHECK: %[[CST:.+]] = arith.constant dense<0.000000e+00> : tensor<4x8xf32>
   %cst = arith.constant dense<0.0> : tensor<4x8xf32>
   %x = arith.constant 100 : index
@@ -19,13 +19,13 @@
     flow.dispatch.tensor.store %0, %arg1_capture, offsets=[0, 0], sizes=[4, 8], strides=[1, 1] : tensor<4x8xf32> -> !flow.dispatch.tensor<readwrite:tensor<4x8xf32>>
     flow.return
   }
-  return %0 : tensor<4x8xf32>
+  util.return %0 : tensor<4x8xf32>
 }
 
 // -----
 
-// CHECK-LABEL: func.func @remove_unused_result
-func.func @remove_unused_result(%arg0 : tensor<9xi32>, %arg1 : tensor<9xi32>) -> (tensor<i32>) {
+// CHECK-LABEL: util.func public @remove_unused_result
+util.func public @remove_unused_result(%arg0 : tensor<9xi32>, %arg1 : tensor<9xi32>) -> (tensor<i32>) {
   %c1 = arith.constant 1 : index
   //      CHECK: flow.dispatch.workgroups[%c1]() : () -> tensor<i32> =
   // CHECK-NEXT:   (%{{.+}}: !flow.dispatch.tensor<writeonly:tensor<i32>>)
@@ -44,13 +44,13 @@
     flow.dispatch.tensor.store %4, %arg3, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:tensor<i32>>
     flow.return
   }
-  return %0#0 : tensor<i32>
+  util.return %0#0 : tensor<i32>
 }
 
 // -----
 
-// CHECK-LABEL: func.func @remove_unused_dynamic_result
-func.func @remove_unused_dynamic_result(%dim: index) -> (tensor<i32>) {
+// CHECK-LABEL: util.func public @remove_unused_dynamic_result
+util.func public @remove_unused_dynamic_result(%dim: index) -> (tensor<i32>) {
   %c1 = arith.constant 1 : index
   //      CHECK: flow.dispatch.workgroups[%c1]() : () -> tensor<i32> =
   // CHECK-NEXT:   (%{{.+}}: !flow.dispatch.tensor<writeonly:tensor<i32>>)
@@ -73,13 +73,13 @@
     flow.dispatch.tensor.store %ret1_value, %ret1_shaped, offsets = [0], sizes = [%dim], strides = [1] : tensor<?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?xi32>>{%dim}
     flow.return
   }
-  return %0#0 : tensor<i32>
+  util.return %0#0 : tensor<i32>
 }
 
 // -----
 
-// CHECK-LABEL: func.func @remove_unused_read_write_result
-func.func @remove_unused_read_write_result(%arg0 : tensor<9xi32>, %arg1 : tensor<9xi32>) -> (tensor<i32>) {
+// CHECK-LABEL: util.func public @remove_unused_read_write_result
+util.func public @remove_unused_read_write_result(%arg0 : tensor<9xi32>, %arg1 : tensor<9xi32>) -> (tensor<i32>) {
   %c1 = arith.constant 1 : index
   //      CHECK: flow.dispatch.workgroups[%c1]() : () -> tensor<i32> =
   // CHECK-NEXT:   (%{{.+}}: !flow.dispatch.tensor<writeonly:tensor<i32>>)
@@ -98,13 +98,13 @@
     flow.dispatch.tensor.store %4, %arg3, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<readwrite:tensor<i32>>
     flow.return
   }
-  return %0#0 : tensor<i32>
+  util.return %0#0 : tensor<i32>
 }
 
 // -----
 
-// CHECK-LABEL: func.func @keep_used_read_write_result
-func.func @keep_used_read_write_result(%arg0 : tensor<9xi32>, %arg1 : tensor<9xi32>) -> (tensor<i32>) {
+// CHECK-LABEL: util.func public @keep_used_read_write_result
+util.func public @keep_used_read_write_result(%arg0 : tensor<9xi32>, %arg1 : tensor<9xi32>) -> (tensor<i32>) {
   %c1 = arith.constant 1 : index
   //      CHECK: flow.dispatch.workgroups[%c1]() : () -> (tensor<i32>, tensor<i32>) =
   // CHECK-NEXT:   (%{{.+}}: !flow.dispatch.tensor<writeonly:tensor<i32>>, %{{.+}}: !flow.dispatch.tensor<readwrite:tensor<i32>>)
@@ -121,13 +121,13 @@
     flow.dispatch.tensor.store %4, %arg3, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<readwrite:tensor<i32>>
     flow.return
   }
-  return %0#0 : tensor<i32>
+  util.return %0#0 : tensor<i32>
 }
 
 // -----
 
-// CHECK-LABEL: func.func @drop_unused_dispatch_region_result
-func.func @drop_unused_dispatch_region_result(
+// CHECK-LABEL: util.func public @drop_unused_dispatch_region_result
+util.func public @drop_unused_dispatch_region_result(
     %arg0: tensor<?x?xf32>, %arg1: tensor<5x10xf32>, %arg2: tensor<7x11xf32>)
   -> tensor<?x?xf32>
 {
@@ -144,14 +144,14 @@
     %1 = tensor.insert_slice %arg2 into %0[9, 10][7, 11][1, 1] : tensor<7x11xf32> into tensor<?x?xf32>
     flow.return %0, %1 : tensor<?x?xf32>, tensor<?x?xf32>
   }
-  // CHECK: return %[[r]]
-  return %r#0 : tensor<?x?xf32>
+  // CHECK: util.return %[[r]]
+  util.return %r#0 : tensor<?x?xf32>
 }
 
 // -----
 
-// CHECK-LABEL: func @bubble_up_ordinal_ops(
-func.func @bubble_up_ordinal_ops(%arg0 : index, %arg1 : index) -> tensor<?x?xf32> {
+// CHECK-LABEL: util.func public @bubble_up_ordinal_ops(
+util.func public @bubble_up_ordinal_ops(%arg0 : index, %arg1 : index) -> tensor<?x?xf32> {
   %result = flow.dispatch.workgroups[%arg0, %arg1](%arg0, %arg1) : (index, index) -> (tensor<?x?xf32>{%arg0, %arg1}) =
       (%b0 : index, %b1 : index, %b2 : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) {
     //      CHECK: flow.dispatch.workgroups
@@ -174,13 +174,13 @@
         : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%wl0, %wl1}
     flow.return
   }
-  return %result : tensor<?x?xf32>
+  util.return %result : tensor<?x?xf32>
 }
 
 // -----
 
-// CHECK-LABEL: func @dedup_workgroup_count_from_slice_operands(
-func.func @dedup_workgroup_count_from_slice_operands(
+// CHECK-LABEL: util.func public @dedup_workgroup_count_from_slice_operands(
+util.func public @dedup_workgroup_count_from_slice_operands(
   %arg0 : index, %arg1 : index, %arg2 : index) -> tensor<?x?x?x?x?xf32> {
   %result = flow.dispatch.workgroups [%arg0, %arg1, %arg2](%arg0, %arg1, %arg2)
       : (index, index, index) -> tensor<?x?x?x?x?xf32>{%arg0, %arg1, %arg2, %arg2, %arg0} =
@@ -211,16 +211,16 @@
     %x, %y, %z = flow.dispatch.workgroup_count_from_slice %b0, %b1, %b2, %b2, %b0
     flow.return %x, %y, %z : index, index, index
   }
-  return %result :tensor<?x?x?x?x?xf32>
+  util.return %result :tensor<?x?x?x?x?xf32>
 }
 
 // -----
 
-// CHECK-LABEL: func @dedup_workload(
+// CHECK-LABEL: util.func public @dedup_workload(
 //  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index
 //  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index
 //  CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: index)
-func.func @dedup_workload(
+util.func public @dedup_workload(
   %arg0 : index, %arg1 : index, %arg2 : index) -> tensor<?x?x?x?x?xf32> {
   %result = flow.dispatch.workgroups [%arg0, %arg1, %arg2, %arg2, %arg0](%arg0, %arg1, %arg2)
       : (index, index, index) -> tensor<?x?x?x?x?xf32>{%arg0, %arg1, %arg2, %arg2, %arg0} =
@@ -251,5 +251,5 @@
     %x, %y, %z = flow.dispatch.workgroup_count_from_slice %b0, %b1, %b2, %b3, %b4
     flow.return %x, %y, %z : index, index, index
   }
-  return %result :tensor<?x?x?x?x?xf32>
+  util.return %result :tensor<?x?x?x?x?xf32>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Flow/IR/test/resolve_dim_ops.mlir b/compiler/src/iree/compiler/Dialect/Flow/IR/test/resolve_dim_ops.mlir
index a7321c3..25cffde 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/IR/test/resolve_dim_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/IR/test/resolve_dim_ops.mlir

@@ -1,6 +1,6 @@
 // RUN: iree-opt -resolve-ranked-shaped-type-result-dims -split-input-file %s | FileCheck %s
 
-func.func @tensor_load_op() -> (index, index) {
+util.func public @tensor_load_op() -> (index, index) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %0 = hal.interface.constant.load[0] : index
@@ -11,9 +11,9 @@
       : !flow.dispatch.tensor<readonly:tensor<?x1x1x?xf32>>{%0, %1} -> tensor<?x?xf32>
   %4 = tensor.dim %3, %c0 : tensor<?x?xf32>
   %5 = tensor.dim %3, %c1 : tensor<?x?xf32>
-  return %4, %5 : index, index
+  util.return %4, %5 : index, index
 }
-// CHECK-LABEL: func @tensor_load_op()
+// CHECK-LABEL: util.func public @tensor_load_op()
 //   CHECK-DAG:   %[[D0:.+]] = hal.interface.constant.load[0]
 //   CHECK-DAG:   %[[D1:.+]] = hal.interface.constant.load[1]
-//       CHECK:   return %[[D0]], %[[D1]]
+//       CHECK:   util.return %[[D0]], %[[D1]]

diff --git a/compiler/src/iree/compiler/Dialect/Flow/IR/test/tensor_folding.mlir b/compiler/src/iree/compiler/Dialect/Flow/IR/test/tensor_folding.mlir
index 57a972e..a07f17c 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/IR/test/tensor_folding.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/IR/test/tensor_folding.mlir

@@ -1,7 +1,7 @@
 // RUN: iree-opt --split-input-file --canonicalize %s | FileCheck %s
 
 // CHECK-LABEL: @expandStaticShapeConstant
-func.func @expandStaticShapeConstant() -> (tensor<2x4xi32>, index, index) {
+util.func public @expandStaticShapeConstant() -> (tensor<2x4xi32>, index, index) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   // CHECK-DAG: %[[CST:.+]] = arith.constant dense<2> : tensor<2x4xi32>
@@ -10,14 +10,14 @@
   %d0 = tensor.dim %0, %c0 : tensor<2x4xi32>
   // CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
   %d1 = tensor.dim %0, %c1 : tensor<2x4xi32>
-  // CHECK: return %[[CST]], %[[C2]], %[[C4]]
-  return %0, %d0, %d1 : tensor<2x4xi32>, index, index
+  // CHECK: util.return %[[CST]], %[[C2]], %[[C4]]
+  util.return %0, %d0, %d1 : tensor<2x4xi32>, index, index
 }
 
 // -----
 
 // CHECK-LABEL: @expandDynamicShapeConstant
-func.func @expandDynamicShapeConstant() -> (tensor<?x?xi32>, index, index) {
+util.func public @expandDynamicShapeConstant() -> (tensor<?x?xi32>, index, index) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   // CHECK-DAG: %[[CST:.+]] = arith.constant dense<2> : tensor<2x4xi32>
@@ -29,93 +29,93 @@
   %0 = flow.tensor.constant dense<2> : tensor<2x4xi32> -> tensor<?x?xi32>
   %d0 = tensor.dim %0, %c0 : tensor<?x?xi32>
   %d1 = tensor.dim %0, %c1 : tensor<?x?xi32>
-  // CHECK: return %[[T]], %[[D0]], %[[D1]]
-  return %0, %d0, %d1 : tensor<?x?xi32>, index, index
+  // CHECK: util.return %[[T]], %[[D0]], %[[D1]]
+  util.return %0, %d0, %d1 : tensor<?x?xi32>, index, index
 }
 
 // -----
 
 // CHECK-LABEL: @tieShapeStaticZeroElements
-func.func @tieShapeStaticZeroElements(%arg0: tensor<0xi32>) -> tensor<0xi32> {
+util.func public @tieShapeStaticZeroElements(%arg0: tensor<0xi32>) -> tensor<0xi32> {
   // CHECK-NOT: flow.tensor.tie_shape
   %0 = flow.tensor.tie_shape %arg0 : tensor<0xi32>
-  // CHECK: return %arg0
-  return %0 : tensor<0xi32>
+  // CHECK: util.return %arg0
+  util.return %0 : tensor<0xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @tieShapeDynamicZeroElements
 // CHECK-SAME: (%[[OPERAND:.+]]: tensor<0x?xi32>, %[[DIM:.+]]: index)
-func.func @tieShapeDynamicZeroElements(%arg0: tensor<0x?xi32>, %dim: index) -> tensor<0x?xi32> {
+util.func public @tieShapeDynamicZeroElements(%arg0: tensor<0x?xi32>, %dim: index) -> tensor<0x?xi32> {
   // CHECK-NOT: flow.tensor.tie_shape
   // CHECK: %[[RET:.+]] = flow.tensor.empty : tensor<0x?xi32>{%[[DIM]]}
   %0 = flow.tensor.tie_shape %arg0 : tensor<0x?xi32>{%dim}
-  // CHECK: return %[[RET]]
-  return %0 : tensor<0x?xi32>
+  // CHECK: util.return %[[RET]]
+  util.return %0 : tensor<0x?xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @reshapeNoOpScalar
-func.func @reshapeNoOpScalar(%arg0: tensor<f32>) -> tensor<f32> {
-  // CHECK-NEXT: return %arg0 : tensor<f32>
+util.func public @reshapeNoOpScalar(%arg0: tensor<f32>) -> tensor<f32> {
+  // CHECK-NEXT: util.return %arg0 : tensor<f32>
   %0 = flow.tensor.reshape %arg0 : tensor<f32> -> tensor<f32>
-  return %0 : tensor<f32>
+  util.return %0 : tensor<f32>
 }
 
 // -----
 
 // CHECK-LABEL: @reshapeNoOpStatic
-func.func @reshapeNoOpStatic(%arg0: tensor<4x4xf32>) -> tensor<4x4xf32> {
-  // CHECK-NEXT: return %arg0 : tensor<4x4xf32>
+util.func public @reshapeNoOpStatic(%arg0: tensor<4x4xf32>) -> tensor<4x4xf32> {
+  // CHECK-NEXT: util.return %arg0 : tensor<4x4xf32>
   %0 = flow.tensor.reshape %arg0 : tensor<4x4xf32> -> tensor<4x4xf32>
-  return %0 : tensor<4x4xf32>
+  util.return %0 : tensor<4x4xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @bitcastSameBitWidth
-func.func @bitcastSameBitWidth(%arg0: tensor<f32>) -> tensor<i32> {
+util.func public @bitcastSameBitWidth(%arg0: tensor<f32>) -> tensor<i32> {
   // CHECK-NEXT: flow.tensor.bitcast %arg0
   %0 = flow.tensor.bitcast %arg0 : tensor<f32> -> tensor<i32>
-  return %0 : tensor<i32>
+  util.return %0 : tensor<i32>
 }
 
 // -----
 
 // CHECK-LABEL: @reshapeRankDifferent
-func.func @reshapeRankDifferent(%arg0: tensor<1xf32>) -> tensor<f32> {
+util.func public @reshapeRankDifferent(%arg0: tensor<1xf32>) -> tensor<f32> {
   // CHECK-NEXT: flow.tensor.reshape %arg0
   %0 = flow.tensor.reshape %arg0 : tensor<1xf32> -> tensor<f32>
-  return %0 : tensor<f32>
+  util.return %0 : tensor<f32>
 }
 
 // -----
 
 // CHECK-LABEL: @reshapeStaticDifferent
-func.func @reshapeStaticDifferent(%arg0: tensor<1x4xf32>) -> tensor<4x1xf32> {
+util.func public @reshapeStaticDifferent(%arg0: tensor<1x4xf32>) -> tensor<4x1xf32> {
   // CHECK-NEXT: flow.tensor.reshape %arg0
   %0 = flow.tensor.reshape %arg0 : tensor<1x4xf32> -> tensor<4x1xf32>
-  return %0 : tensor<4x1xf32>
+  util.return %0 : tensor<4x1xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @reshapeNoOpDynamic
-func.func @reshapeNoOpDynamic(%arg0: tensor<4x?xf32>, %dim: index) -> tensor<4x?xf32> {
-  // CHECK-NEXT: return %arg0 : tensor<4x?xf32>
+util.func public @reshapeNoOpDynamic(%arg0: tensor<4x?xf32>, %dim: index) -> tensor<4x?xf32> {
+  // CHECK-NEXT: util.return %arg0 : tensor<4x?xf32>
   %0 = flow.tensor.reshape %arg0 : tensor<4x?xf32>{%dim} -> tensor<4x?xf32>{%dim}
-  return %0 : tensor<4x?xf32>
+  util.return %0 : tensor<4x?xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @reshapeDynamicDifferent
-func.func @reshapeDynamicDifferent(%arg0: tensor<4x?xf32>, %dim0: index, %dim1: index) -> tensor<4x?xf32> {
+util.func public @reshapeDynamicDifferent(%arg0: tensor<4x?xf32>, %dim0: index, %dim1: index) -> tensor<4x?xf32> {
   // CHECK-NEXT: flow.tensor.reshape %arg0
   %0 = flow.tensor.reshape %arg0 : tensor<4x?xf32>{%dim0} -> tensor<4x?xf32>{%dim1}
-  return %0 : tensor<4x?xf32>
+  util.return %0 : tensor<4x?xf32>
 }
 
 // -----
@@ -123,12 +123,12 @@
 // CHECK-LABEL: @flattenReshapeChain
 // CHECK-SAME: %[[ARG:.+]]: tensor<4x?xf32>,
 // CHECK-SAME: %[[DIM0:.+]]: index, %[[DIM1:.+]]: index, %[[DIM2:.+]]: index
-func.func @flattenReshapeChain(%arg0: tensor<4x?xf32>, %dim0: index, %dim1: index, %dim2: index) -> tensor<4x?xf32> {
+util.func public @flattenReshapeChain(%arg0: tensor<4x?xf32>, %dim0: index, %dim1: index, %dim2: index) -> tensor<4x?xf32> {
   // CHECK-NEXT: %[[RET:.+]] = flow.tensor.reshape %[[ARG]] : tensor<4x?xf32>{%[[DIM0]]} -> tensor<4x?xf32>{%[[DIM2]]}
   %0 = flow.tensor.reshape %arg0 : tensor<4x?xf32>{%dim0} -> tensor<4x?xf32>{%dim1}
   %1 = flow.tensor.reshape %0 : tensor<4x?xf32>{%dim1} -> tensor<4x?xf32>{%dim2}
-  // CHECK-NEXT: return %[[RET]]
-  return %1 : tensor<4x?xf32>
+  // CHECK-NEXT: util.return %[[RET]]
+  util.return %1 : tensor<4x?xf32>
 }
 
 // -----
@@ -136,12 +136,12 @@
 // CHECK-LABEL: @flattenReshapeBitcastChain
 // CHECK-SAME: %[[ARG:.+]]: tensor<4x?xi16>,
 // CHECK-SAME: %[[DIM0:.+]]: index, %[[DIM1:.+]]: index, %[[DIM2:.+]]: index
-func.func @flattenReshapeBitcastChain(%arg0: tensor<4x?xi16>, %dim0: index, %dim1: index, %dim2: index) -> tensor<4x?xbf16> {
+util.func public @flattenReshapeBitcastChain(%arg0: tensor<4x?xi16>, %dim0: index, %dim1: index, %dim2: index) -> tensor<4x?xbf16> {
   // CHECK-NEXT: %[[RET:.+]] = flow.tensor.bitcast %[[ARG]] : tensor<4x?xi16>{%[[DIM0]]} -> tensor<4x?xbf16>{%[[DIM2]]}
   %0 = flow.tensor.bitcast %arg0 : tensor<4x?xi16>{%dim0} -> tensor<4x?xf16>{%dim1}
   %1 = flow.tensor.bitcast %0 : tensor<4x?xf16>{%dim1} -> tensor<4x?xbf16>{%dim2}
-  // CHECK-NEXT: return %[[RET]]
-  return %1 : tensor<4x?xbf16>
+  // CHECK-NEXT: util.return %[[RET]]
+  util.return %1 : tensor<4x?xbf16>
 }
 
 // -----
@@ -149,12 +149,12 @@
 // CHECK-LABEL: @flattenBitCastChain
 // CHECK-SAME: %[[ARG:.+]]: tensor<?x4xi16>,
 // CHECK-SAME: %[[DIM0:.+]]: index, %[[DIM1:.+]]: index, %[[DIM2:.+]]: index
-func.func @flattenBitCastChain(%arg0: tensor<?x4xi16>, %dim0: index, %dim1: index, %dim2: index) -> tensor<?x8xi8> {
+util.func public @flattenBitCastChain(%arg0: tensor<?x4xi16>, %dim0: index, %dim1: index, %dim2: index) -> tensor<?x8xi8> {
   // CHECK-NEXT: %[[RET:.+]] = flow.tensor.bitcast %[[ARG]] : tensor<?x4xi16>{%[[DIM0]]} -> tensor<?x8xi8>{%[[DIM2]]}
   %0 = flow.tensor.bitcast %arg0 : tensor<?x4xi16>{%dim0} -> tensor<?x2xi32>{%dim1}
   %1 = flow.tensor.bitcast %0 : tensor<?x2xi32>{%dim1} -> tensor<?x8xi8>{%dim2}
-  // CHECK-NEXT: return %[[RET]]
-  return %1 : tensor<?x8xi8>
+  // CHECK-NEXT: util.return %[[RET]]
+  util.return %1 : tensor<?x8xi8>
 }
 
 // -----
@@ -162,13 +162,13 @@
 // CHECK-LABEL: @flattenBitCastReshapeBitCast
 // CHECK-SAME: %[[ARG:.+]]: tensor<?x16xi16>,
 // CHECK-SAME: %[[DIM0:.+]]: index, %[[DIM1:.+]]: index, %[[DIM2:.+]]: index, %[[DIM3:.+]]: index
-func.func @flattenBitCastReshapeBitCast(%arg0: tensor<?x16xi16>, %dim0: index, %dim1: index, %dim2: index, %dim3: index) -> tensor<?x4x4xi16> {
+util.func public @flattenBitCastReshapeBitCast(%arg0: tensor<?x16xi16>, %dim0: index, %dim1: index, %dim2: index, %dim3: index) -> tensor<?x4x4xi16> {
   // CHECK-NEXT: %[[RET:.+]] = flow.tensor.reshape %[[ARG]] : tensor<?x16xi16>{%[[DIM0]]} -> tensor<?x4x4xi16>{%[[DIM3]]}
   %0 = flow.tensor.bitcast %arg0 : tensor<?x16xi16>{%dim0} -> tensor<?x8xi32>{%dim1}
   %1 = flow.tensor.reshape %0 : tensor<?x8xi32>{%dim1} -> tensor<?x4x2xi32>{%dim2}
   %2 = flow.tensor.bitcast %1 : tensor<?x4x2xi32>{%dim2} -> tensor<?x4x4xi16>{%dim3}
-  // CHECK-NEXT: return %[[RET]]
-  return %2 : tensor<?x4x4xi16>
+  // CHECK-NEXT: util.return %[[RET]]
+  util.return %2 : tensor<?x4x4xi16>
 }
 
 
@@ -176,88 +176,88 @@
 
 // CHECK-LABEL: @reshapeFromStaticZeroElements
 // CHECK-SAME: (%[[OPERAND:.+]]: tensor<4x0xf32>, %[[DIM:.+]]: index)
-func.func @reshapeFromStaticZeroElements(%arg0: tensor<4x0xf32>, %dim: index) -> tensor<4x?xf32> {
+util.func public @reshapeFromStaticZeroElements(%arg0: tensor<4x0xf32>, %dim: index) -> tensor<4x?xf32> {
   // CHECK: %[[RET:.+]] = flow.tensor.empty : tensor<4x?xf32>{%[[DIM]]}
   %0 = flow.tensor.reshape %arg0 : tensor<4x0xf32> -> tensor<4x?xf32>{%dim}
-  // CHECK-NEXT: return %[[RET]]
-  return %0 : tensor<4x?xf32>
+  // CHECK-NEXT: util.return %[[RET]]
+  util.return %0 : tensor<4x?xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @reshapeFromDynamicZeroElements
 // CHECK-SAME: (%[[OPERAND:.+]]: tensor<0x?xf32>, %[[DIM0:.+]]: index, %[[DIM1:.+]]: index)
-func.func @reshapeFromDynamicZeroElements(%arg0: tensor<0x?xf32>, %dim0: index, %dim1: index) -> tensor<4x?xf32> {
+util.func public @reshapeFromDynamicZeroElements(%arg0: tensor<0x?xf32>, %dim0: index, %dim1: index) -> tensor<4x?xf32> {
   // CHECK: %[[RET:.+]] = flow.tensor.empty : tensor<4x?xf32>{%[[DIM1]]}
   %0 = flow.tensor.reshape %arg0 : tensor<0x?xf32>{%dim0} -> tensor<4x?xf32>{%dim1}
-  // CHECK-NEXT: return %[[RET]]
-  return %0 : tensor<4x?xf32>
+  // CHECK-NEXT: util.return %[[RET]]
+  util.return %0 : tensor<4x?xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @reshapeToStaticZeroElements
-func.func @reshapeToStaticZeroElements(%arg0: tensor<4x?xf32>, %dim0: index) {
+util.func public @reshapeToStaticZeroElements(%arg0: tensor<4x?xf32>, %dim0: index) {
   // CHECK-NEXT: %[[RET:.+]] = flow.tensor.empty : tensor<4x0xf32>
   %0 = flow.tensor.reshape %arg0 : tensor<4x?xf32>{%dim0} -> tensor<4x0xf32>
   // CHECK-NEXT: util.optimization_barrier %[[RET]]
   util.optimization_barrier %0 : tensor<4x0xf32>
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @reshapeToDynamicZeroElements
 // CHECK-SAME: (%[[OPERAND:.+]]: tensor<4x?xf32>, %[[DIM0:.+]]: index, %[[DIM1:.+]]: index)
-func.func @reshapeToDynamicZeroElements(%arg0: tensor<4x?xf32>, %dim0: index, %dim1: index) {
+util.func public @reshapeToDynamicZeroElements(%arg0: tensor<4x?xf32>, %dim0: index, %dim1: index) {
   // CHECK: %[[RET:.+]] = flow.tensor.empty : tensor<0x?xf32>{%[[DIM1]]}
   %0 = flow.tensor.reshape %arg0 : tensor<4x?xf32>{%dim0} -> tensor<0x?xf32>{%dim1}
   // CHECK-NEXT: util.optimization_barrier %[[RET]]
   util.optimization_barrier %0 : tensor<0x?xf32>
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @reshapeEmpty
 // CHECK-SAME: (%[[DIM:.+]]: index)
-func.func @reshapeEmpty(%dim: index) -> tensor<?xi32> {
+util.func public @reshapeEmpty(%dim: index) -> tensor<?xi32> {
   // CHECK: %[[RET:.+]] = flow.tensor.empty : tensor<?xi32>{%[[DIM]]}
   %0 = flow.tensor.empty : tensor<1x?xi32>{%dim}
   // CHECK-NOT: flow.tensor.reshape
   %1 = flow.tensor.reshape %0 : tensor<1x?xi32>{%dim} -> tensor<?xi32>{%dim}
-  // CHECK: return %[[RET]]
-  return %1 : tensor<?xi32>
+  // CHECK: util.return %[[RET]]
+  util.return %1 : tensor<?xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @loadConst
-func.func @loadConst() -> i32 {
+util.func public @loadConst() -> i32 {
   %0 = arith.constant dense<[[0, 1], [2, 3]]> : tensor<2x2xi32>
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   // CHECK-NEXT: %[[C2:.+]] = arith.constant 2 : i32
   %2 = flow.tensor.load %0[%c1, %c0] : tensor<2x2xi32>
-  // CHECK-NEXT: return %[[C2]]
-  return %2 : i32
+  // CHECK-NEXT: util.return %[[C2]]
+  util.return %2 : i32
 }
 
 // -----
 
 // CHECK-LABEL: @loadConstScalar
-func.func @loadConstScalar() -> i32 {
+util.func public @loadConstScalar() -> i32 {
   %0 = arith.constant dense<4> : tensor<i32>
   // CHECK-NEXT: %[[C4:.+]] = arith.constant 4 : i32
   %1 = flow.tensor.load %0 : tensor<i32>
-  // CHECK-NEXT: return %[[C4]]
-  return %1 : i32
+  // CHECK-NEXT: util.return %[[C4]]
+  util.return %1 : i32
 }
 
 // -----
 
 // CHECK-LABEL: @storeConst
-func.func @storeConst() -> tensor<2x2xi32> {
+util.func public @storeConst() -> tensor<2x2xi32> {
   %0 = arith.constant dense<[[0, 1], [2, 3]]> : tensor<2x2xi32>
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
@@ -266,27 +266,27 @@
   // CHECK-SAME:     [0, 1], [4, 3]
   // CHECK-SAME: ]> : tensor<2x2xi32>
   %1 = flow.tensor.store %c4, %0[%c1, %c0] : tensor<2x2xi32>
-  // CHECK-NEXT: return %[[C]]
-  return %1 : tensor<2x2xi32>
+  // CHECK-NEXT: util.return %[[C]]
+  util.return %1 : tensor<2x2xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @storeConstScalar
-func.func @storeConstScalar() -> tensor<i32> {
+util.func public @storeConstScalar() -> tensor<i32> {
   %0 = arith.constant dense<0> : tensor<i32>
   %1 = arith.constant 4 : i32
   // CHECK-NEXT: %[[C:.+]] = arith.constant dense<4> : tensor<i32>
   %2 = flow.tensor.store %1, %0 : tensor<i32>
-  // CHECK-NEXT: return %[[C]]
-  return %2 : tensor<i32>
+  // CHECK-NEXT: util.return %[[C]]
+  util.return %2 : tensor<i32>
 }
 
 // -----
 
 // CHECK-LABEL: @allocaDims
 //  CHECK-SAME: (%[[DIM:.+]]: index)
-func.func @allocaDims(%dim: index) -> (index, index, index) {
+util.func public @allocaDims(%dim: index) -> (index, index, index) {
   // CHECK-NOT: flow.tensor.alloca
   %0 = flow.tensor.alloca : tensor<4x?x0xf32>{%dim}
   %c0 = arith.constant 0 : index
@@ -295,15 +295,15 @@
   %d0 = tensor.dim %0, %c0 : tensor<4x?x0xf32>
   %d1 = tensor.dim %0, %c1 : tensor<4x?x0xf32>
   %d2 = tensor.dim %0, %c2 : tensor<4x?x0xf32>
-  // CHECK: return %c4, %[[DIM]], %c0
-  return %d0, %d1, %d2 : index, index, index
+  // CHECK: util.return %c4, %[[DIM]], %c0
+  util.return %d0, %d1, %d2 : index, index, index
 }
 
 // -----
 
 // CHECK-LABEL: @emptyDims
 //  CHECK-SAME: (%[[DIM:.+]]: index)
-func.func @emptyDims(%dim: index) -> (index, index, index) {
+util.func public @emptyDims(%dim: index) -> (index, index, index) {
   // CHECK-NOT: flow.tensor.empty
   %0 = flow.tensor.empty : tensor<4x?x0xf32>{%dim}
   %c0 = arith.constant 0 : index
@@ -312,130 +312,130 @@
   %d0 = tensor.dim %0, %c0 : tensor<4x?x0xf32>
   %d1 = tensor.dim %0, %c1 : tensor<4x?x0xf32>
   %d2 = tensor.dim %0, %c2 : tensor<4x?x0xf32>
-  // CHECK: return %c4, %[[DIM]], %c0
-  return %d0, %d1, %d2 : index, index, index
+  // CHECK: util.return %c4, %[[DIM]], %c0
+  util.return %d0, %d1, %d2 : index, index, index
 }
 
 // -----
 
 // CHECK-LABEL: @splatDynamicShape
 //  CHECK-SAME: (%[[DIM0:.+]]: index, %[[DIM1:.+]]: index)
-func.func @splatDynamicShape(%dim0: index, %dim1: index) -> tensor<?x?xi32> {
+util.func public @splatDynamicShape(%dim0: index, %dim1: index) -> tensor<?x?xi32> {
   // CHECK: %[[FOUR:.+]] = arith.constant 4 : i32
   %four = arith.constant 4 : i32
   // CHECK: %[[SPLAT:.+]] = flow.tensor.splat %[[FOUR]] : tensor<?x?xi32>{%[[DIM0]], %[[DIM1]]}
   %1 = flow.tensor.splat %four : tensor<?x?xi32>{%dim0, %dim1}
-  // CHECK: return %[[SPLAT]]
-  return %1 : tensor<?x?xi32>
+  // CHECK: util.return %[[SPLAT]]
+  util.return %1 : tensor<?x?xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @splatStaticZeroElements
-func.func @splatStaticZeroElements(%value: f32) -> tensor<0x2xf32> {
+util.func public @splatStaticZeroElements(%value: f32) -> tensor<0x2xf32> {
   // CHECK-NEXT: %[[RET:.+]] = flow.tensor.empty : tensor<0x2xf32>
   %0 = flow.tensor.splat %value : tensor<0x2xf32>
-  // CHECK-NEXT: return %[[RET]]
-  return %0 : tensor<0x2xf32>
+  // CHECK-NEXT: util.return %[[RET]]
+  util.return %0 : tensor<0x2xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @splatDynamicZeroElements
 //  CHECK-SAME: (%[[VALUE:.+]]: f32, %[[DIM:.+]]: index)
-func.func @splatDynamicZeroElements(%value: f32, %dim: index) -> tensor<0x?xf32> {
+util.func public @splatDynamicZeroElements(%value: f32, %dim: index) -> tensor<0x?xf32> {
   // CHECK-NEXT: %[[RET:.+]] = flow.tensor.empty : tensor<0x?xf32>{%[[DIM]]}
   %0 = flow.tensor.splat %value : tensor<0x?xf32>{%dim}
-  // CHECK-NEXT: return %[[RET]]
-  return %0 : tensor<0x?xf32>
+  // CHECK-NEXT: util.return %[[RET]]
+  util.return %0 : tensor<0x?xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @cloneConst
-func.func @cloneConst() -> tensor<4xi32> {
+util.func public @cloneConst() -> tensor<4xi32> {
   // CHECK-NEXT: %[[C:.+]] = arith.constant dense<[0, 1, 2, 3]> : tensor<4xi32>
   %0 = arith.constant dense<[0, 1, 2, 3]> : tensor<4xi32>
   %1 = flow.tensor.clone %0 : tensor<4xi32>
-  // CHECK-NEXT: return %[[C]]
-  return %1 : tensor<4xi32>
+  // CHECK-NEXT: util.return %[[C]]
+  util.return %1 : tensor<4xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @cloneConstZeroElements
-func.func @cloneConstZeroElements() -> tensor<0x2xi32> {
+util.func public @cloneConstZeroElements() -> tensor<0x2xi32> {
   // CHECK-NEXT: %[[C:.+]] = arith.constant dense<> : tensor<0x2xi32>
   %0 = arith.constant dense<> : tensor<0x2xi32>
   // CHECK-NOT: flow.tensor.clone
   %1 = flow.tensor.clone %0 : tensor<0x2xi32>
-  // CHECK-NEXT: return %[[C]]
-  return %1 : tensor<0x2xi32>
+  // CHECK-NEXT: util.return %[[C]]
+  util.return %1 : tensor<0x2xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @cloneStaticZeroElements
-func.func @cloneStaticZeroElements(%arg0: tensor<0x2xf32>) -> tensor<0x2xf32> {
+util.func public @cloneStaticZeroElements(%arg0: tensor<0x2xf32>) -> tensor<0x2xf32> {
   // CHECK-NOT: flow.tensor.clone
   // CHECK-NEXT: %[[RET:.+]] = flow.tensor.empty : tensor<0x2xf32>
   %0 = flow.tensor.clone %arg0 : tensor<0x2xf32>
   // CHECK-NEXT: %[[RET]]
-  return %0 : tensor<0x2xf32>
+  util.return %0 : tensor<0x2xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @cloneDynamicZeroElements
 //  CHECK-SAME: (%[[OPERAND:.+]]: tensor<0x?xf32>, %[[DIM:.+]]: index)
-func.func @cloneDynamicZeroElements(%arg0: tensor<0x?xf32>, %dim: index) -> tensor<0x?xf32> {
+util.func public @cloneDynamicZeroElements(%arg0: tensor<0x?xf32>, %dim: index) -> tensor<0x?xf32> {
   // CHECK-NOT: flow.tensor.clone
   // CHECK-NEXT: %[[RET:.+]] = flow.tensor.empty : tensor<0x?xf32>{%[[DIM]]}
   %0 = flow.tensor.clone %arg0 : tensor<0x?xf32>{%dim}
   // CHECK-NEXT: %[[RET]]
-  return %0 : tensor<0x?xf32>
+  util.return %0 : tensor<0x?xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @sliceConst0D
-func.func @sliceConst0D() -> tensor<i32> {
+util.func public @sliceConst0D() -> tensor<i32> {
   %0 = arith.constant dense<0> : tensor<i32>
   // CHECK-NEXT: %[[C:.+]] = arith.constant dense<0> : tensor<i32>
   %1 = flow.tensor.slice %0[for] : tensor<i32> -> tensor<i32>
-  // CHECK-NEXT: return %[[C]]
-  return %1 : tensor<i32>
+  // CHECK-NEXT: util.return %[[C]]
+  util.return %1 : tensor<i32>
 }
 
 // -----
 
 // CHECK-LABEL: @sliceConst1D
-func.func @sliceConst1D() -> tensor<1xi32> {
+util.func public @sliceConst1D() -> tensor<1xi32> {
   %0 = arith.constant dense<0> : tensor<1xi32>
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   // CHECK-NEXT: %[[C:.+]] = arith.constant dense<0> : tensor<1xi32>
   %1 = flow.tensor.slice %0[%c0 for %c1] : tensor<1xi32> -> tensor<1xi32>
-  // CHECK-NEXT: return %[[C]]
-  return %1 : tensor<1xi32>
+  // CHECK-NEXT: util.return %[[C]]
+  util.return %1 : tensor<1xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @sliceConst1DZeroLength
-func.func @sliceConst1DZeroLength() -> tensor<0xi32> {
+util.func public @sliceConst1DZeroLength() -> tensor<0xi32> {
   %0 = arith.constant dense<0> : tensor<1xi32>
   %c0 = arith.constant 0 : index
   // CHECK-NEXT: %[[C:.+]] = arith.constant dense<> : tensor<0xi32>
   %1 = flow.tensor.slice %0[%c0 for %c0] : tensor<1xi32> -> tensor<0xi32>
-  // CHECK-NEXT: return %[[C]]
-  return %1 : tensor<0xi32>
+  // CHECK-NEXT: util.return %[[C]]
+  util.return %1 : tensor<0xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @sliceConst2D
-func.func @sliceConst2D() -> tensor<1x2xi32> {
+util.func public @sliceConst2D() -> tensor<1x2xi32> {
   %0 = arith.constant dense<[[0, 1, 2], [3, 4, 5]]> : tensor<2x3xi32>
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
@@ -444,63 +444,63 @@
   // CHECK-SAME:     [1, 2]
   // CHECK-SAME: ]> : tensor<1x2xi32>
   %1 = flow.tensor.slice %0[%c0, %c1 for %c1, %c2] : tensor<2x3xi32> -> tensor<1x2xi32>
-  // CHECK-NEXT: return %[[C]]
-  return %1 : tensor<1x2xi32>
+  // CHECK-NEXT: util.return %[[C]]
+  util.return %1 : tensor<1x2xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @sliceConst2DZeroLength1
-func.func @sliceConst2DZeroLength1() -> tensor<1x0xi32> {
+util.func public @sliceConst2DZeroLength1() -> tensor<1x0xi32> {
   %0 = arith.constant dense<[[0, 1, 2], [3, 4, 5]]> : tensor<2x3xi32>
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   // CHECK-NEXT: %[[C:.+]] = arith.constant dense<> : tensor<1x0xi32>
   %1 = flow.tensor.slice %0[%c0, %c0 for %c1, %c0] : tensor<2x3xi32> -> tensor<1x0xi32>
-  // CHECK-NEXT: return %[[C]]
-  return %1 : tensor<1x0xi32>
+  // CHECK-NEXT: util.return %[[C]]
+  util.return %1 : tensor<1x0xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @sliceConst2DZeroLength01
-func.func @sliceConst2DZeroLength01() -> tensor<0x0xi32> {
+util.func public @sliceConst2DZeroLength01() -> tensor<0x0xi32> {
   %0 = arith.constant dense<[[0, 1, 2], [3, 4, 5]]> : tensor<2x3xi32>
   %c0 = arith.constant 0 : index
   // CHECK-NEXT: %[[C:.+]] = arith.constant dense<> : tensor<0x0xi32>
   %1 = flow.tensor.slice %0[%c0, %c0 for %c0, %c0] : tensor<2x3xi32> -> tensor<0x0xi32>
-  // CHECK-NEXT: return %[[C]]
-  return %1 : tensor<0x0xi32>
+  // CHECK-NEXT: util.return %[[C]]
+  util.return %1 : tensor<0x0xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @sliceFromZeroElements
-func.func @sliceFromZeroElements(%arg0: tensor<0xi32>) -> tensor<?xi32> {
+util.func public @sliceFromZeroElements(%arg0: tensor<0xi32>) -> tensor<?xi32> {
   %c0 = arith.constant 0 : index
   // CHECK-NOT: flow.tensor.slice
   // CHECK: %[[RET:.+]] = flow.tensor.empty : tensor<?xi32>{%c0}
   %0 = flow.tensor.slice %arg0[%c0 for %c0] : tensor<0xi32> -> tensor<?xi32>{%c0}
-  // CHECK: return %[[RET]]
-  return %0 : tensor<?xi32>
+  // CHECK: util.return %[[RET]]
+  util.return %0 : tensor<?xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @sliceZeroElements
-func.func @sliceZeroElements(%arg0: tensor<?xi32>, %dim: index) -> tensor<0xi32> {
+util.func public @sliceZeroElements(%arg0: tensor<?xi32>, %dim: index) -> tensor<0xi32> {
   %c0 = arith.constant 0 : index
   // CHECK-NOT: flow.tensor.slice
   // CHECK: %[[RET:.+]] = flow.tensor.empty : tensor<0xi32>
   %0 = flow.tensor.slice %arg0[%c0 for %c0] : tensor<?xi32>{%dim} -> tensor<0xi32>
-  // CHECK: return %[[RET]]
-  return %0 : tensor<0xi32>
+  // CHECK: util.return %[[RET]]
+  util.return %0 : tensor<0xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @sliceConst3D
-func.func @sliceConst3D() -> tensor<1x2x3xi32> {
+util.func public @sliceConst3D() -> tensor<1x2x3xi32> {
   %0 = arith.constant dense<[[[0, 1, 2], [3, 4, 5], [6, 7, 8]], [[9, 10, 11], [12, 13, 14], [15, 16, 17]]]> : tensor<2x3x3xi32>
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
@@ -510,52 +510,52 @@
   // CHECK-SAME:                             [
   // CHECK-SAME:                              [3, 4, 5], [6, 7, 8]]]> : tensor<1x2x3xi32>
   %1 = flow.tensor.slice %0[%c0, %c1, %c0 for %c1, %c2, %c3] : tensor<2x3x3xi32> -> tensor<1x2x3xi32>
-  // CHECK-NEXT: return %[[C]]
-  return %1 : tensor<1x2x3xi32>
+  // CHECK-NEXT: util.return %[[C]]
+  util.return %1 : tensor<1x2x3xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @updateConst0D
-func.func @updateConst0D() -> tensor<i32> {
+util.func public @updateConst0D() -> tensor<i32> {
   %0 = arith.constant dense<0> : tensor<i32>
   %1 = arith.constant dense<1> : tensor<i32>
   // CHECK-NEXT: %[[C:.+]] = arith.constant dense<0> : tensor<i32>
   %2 = flow.tensor.update %0, %1[] : tensor<i32> -> tensor<i32>
-  // CHECK-NEXT: return %[[C]]
-  return %2 : tensor<i32>
+  // CHECK-NEXT: util.return %[[C]]
+  util.return %2 : tensor<i32>
 }
 
 // -----
 
 // CHECK-LABEL: @updateConst1D
-func.func @updateConst1D() -> tensor<1xi32> {
+util.func public @updateConst1D() -> tensor<1xi32> {
   %0 = arith.constant dense<0> : tensor<1xi32>
   %1 = arith.constant dense<1> : tensor<1xi32>
   %c0 = arith.constant 0 : index
   // CHECK-NEXT: %[[C:.+]] = arith.constant dense<0> : tensor<1xi32>
   %2 = flow.tensor.update %0, %1[%c0] : tensor<1xi32> -> tensor<1xi32>
-  // CHECK-NEXT: return %[[C]]
-  return %2 : tensor<1xi32>
+  // CHECK-NEXT: util.return %[[C]]
+  util.return %2 : tensor<1xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @updateConst1DUpdateZeroSize
-func.func @updateConst1DUpdateZeroSize() -> tensor<1xi32> {
+util.func public @updateConst1DUpdateZeroSize() -> tensor<1xi32> {
   %0 = arith.constant dense<> : tensor<0xi32>
   %1 = arith.constant dense<1> : tensor<1xi32>
   %c0 = arith.constant 0 : index
   // CHECK-NEXT: %[[C:.+]] = arith.constant dense<1> : tensor<1xi32>
   %2 = flow.tensor.update %0, %1[%c0] : tensor<0xi32> -> tensor<1xi32>
-  // CHECK-NEXT: return %[[C]]
-  return %2 : tensor<1xi32>
+  // CHECK-NEXT: util.return %[[C]]
+  util.return %2 : tensor<1xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @updateConst2DUpdate1x1
-func.func @updateConst2DUpdate1x1() -> tensor<3x4xi32> {
+util.func public @updateConst2DUpdate1x1() -> tensor<3x4xi32> {
   %0 = arith.constant dense<[[12]]> : tensor<1x1xi32>
   %1 = arith.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]]> : tensor<3x4xi32>
   %c0 = arith.constant 0 : index
@@ -563,14 +563,14 @@
   // CHECK-NEXT: %[[C:.+]] = arith.constant dense<[
   // CHECK-SAME: [0, 12, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]]> : tensor<3x4xi32>
   %2 = flow.tensor.update %0, %1[%c0, %c1] : tensor<1x1xi32> -> tensor<3x4xi32>
-  // CHECK-NEXT: return %[[C]]
-  return %2 : tensor<3x4xi32>
+  // CHECK-NEXT: util.return %[[C]]
+  util.return %2 : tensor<3x4xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @updateConst2DUpdate2x2
-func.func @updateConst2DUpdate2x2() -> tensor<3x4xi32> {
+util.func public @updateConst2DUpdate2x2() -> tensor<3x4xi32> {
   %0 = arith.constant dense<[[12, 13], [14, 15]]> : tensor<2x2xi32>
   %1 = arith.constant dense<[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]]> : tensor<3x4xi32>
   %c0 = arith.constant 0 : index
@@ -578,14 +578,14 @@
   // CHECK-NEXT: %[[C:.+]] = arith.constant dense<[
   // CHECK-SAME: [0, 12, 13, 3], [4, 14, 15, 7], [8, 9, 10, 11]]> : tensor<3x4xi32>
   %2 = flow.tensor.update %0, %1[%c0, %c1] : tensor<2x2xi32> -> tensor<3x4xi32>
-  // CHECK-NEXT: return %[[C]]
-  return %2 : tensor<3x4xi32>
+  // CHECK-NEXT: util.return %[[C]]
+  util.return %2 : tensor<3x4xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @updateConst3DUpdate1x2x3
-func.func @updateConst3DUpdate1x2x3() -> tensor<2x3x3xi32> {
+util.func public @updateConst3DUpdate1x2x3() -> tensor<2x3x3xi32> {
   %0 = arith.constant dense<[[[18, 19, 20], [21, 22, 23]]]> : tensor<1x2x3xi32>
   %1 = arith.constant dense<[[[0, 1, 2], [3, 4, 5], [6, 7, 8]], [[9, 10, 11], [12, 13, 14], [15, 16, 17]]]> : tensor<2x3x3xi32>
   %c0 = arith.constant 0 : index
@@ -595,14 +595,14 @@
   // CHECK-SAME:                              [0, 1, 2], [18, 19, 20], [21, 22, 23]], [
   // CHECK-SAME: [9, 10, 11], [12, 13, 14], [15, 16, 17]]]> : tensor<2x3x3xi32>
   %2 = flow.tensor.update %0, %1[%c0, %c1, %c0] : tensor<1x2x3xi32> -> tensor<2x3x3xi32>
-  // CHECK-NEXT: return %[[C]]
-  return %2 : tensor<2x3x3xi32>
+  // CHECK-NEXT: util.return %[[C]]
+  util.return %2 : tensor<2x3x3xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @updateConst3DUpdate2x3x2
-func.func @updateConst3DUpdate2x3x2() -> tensor<2x3x3xi32> {
+util.func public @updateConst3DUpdate2x3x2() -> tensor<2x3x3xi32> {
   %0 = arith.constant dense<[[[18, 19], [20, 21], [22, 23]], [[24, 25], [26, 27], [28, 29]]]> : tensor<2x3x2xi32>
   %1 = arith.constant dense<[[[0, 1, 2], [3, 4, 5], [6, 7, 8]], [[9, 10, 11], [12, 13, 14], [15, 16, 17]]]> : tensor<2x3x3xi32>
   %c0 = arith.constant 0 : index
@@ -612,48 +612,48 @@
   // CHECK-SAME:                              [18, 19, 2], [20, 21, 5], [22, 23, 8]], [
   // CHECK-SAME: [24, 25, 11], [26, 27, 14], [28, 29, 17]]]> : tensor<2x3x3xi32>
   %2 = flow.tensor.update %0, %1[%c0, %c1, %c0] : tensor<2x3x2xi32> -> tensor<2x3x3xi32>
-  // CHECK-NEXT: return %[[C]]
-  return %2 : tensor<2x3x3xi32>
+  // CHECK-NEXT: util.return %[[C]]
+  util.return %2 : tensor<2x3x3xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @updateReplace
-func.func @updateReplace(%arg0 : tensor<4xi32>, %arg1 : tensor<4xi32>) -> tensor<4xi32> {
+util.func public @updateReplace(%arg0 : tensor<4xi32>, %arg1 : tensor<4xi32>) -> tensor<4xi32> {
   %c0 = arith.constant 0 : index
   %0 = flow.tensor.update %arg0, %arg1[%c0] : tensor<4xi32> -> tensor<4xi32>
-  // CHECK-NEXT: return %arg0
-  return %0 : tensor<4xi32>
+  // CHECK-NEXT: util.return %arg0
+  util.return %0 : tensor<4xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @updateIntoZeroElements
-func.func @updateIntoZeroElements(%update: tensor<?x?xi32>, %dim: index, %target: tensor<0x0xi32>) -> tensor<0x0xi32> {
+util.func public @updateIntoZeroElements(%update: tensor<?x?xi32>, %dim: index, %target: tensor<0x0xi32>) -> tensor<0x0xi32> {
   %c0 = arith.constant 0 : index
   // CHECK-NOT: flow.tensor.update
   // CHECK-NEXT: %[[RET:.+]] = flow.tensor.empty : tensor<0x0xi32>
   %0 = flow.tensor.update %update, %target[%c0, %c0] : tensor<?x?xi32>{%dim, %dim} -> tensor<0x0xi32>
-  // CHECK-NEXT: return %[[RET]]
-  return %0 : tensor<0x0xi32>
+  // CHECK-NEXT: util.return %[[RET]]
+  util.return %0 : tensor<0x0xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @updateZeroElements
 //  CHECK-SAME: (%[[UPDATE:.+]]: tensor<0x1xi32>, %[[TARGET:.+]]: tensor<1x1xi32>)
-func.func @updateZeroElements(%update: tensor<0x1xi32>, %target: tensor<1x1xi32>) -> tensor<1x1xi32> {
+util.func public @updateZeroElements(%update: tensor<0x1xi32>, %target: tensor<1x1xi32>) -> tensor<1x1xi32> {
   %c0 = arith.constant 0 : index
   // CHECK-NOT: flow.tensor.update
   %0 = flow.tensor.update %update, %target[%c0, %c0] : tensor<0x1xi32> -> tensor<1x1xi32>
-  // CHECK: return %[[TARGET]]
-  return %0 : tensor<1x1xi32>
+  // CHECK: util.return %[[TARGET]]
+  util.return %0 : tensor<1x1xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @propogateStaticShapeOfTarget
-func.func @propogateStaticShapeOfTarget(%arg0 : tensor<?x?xf32>, %arg1 : f32) -> tensor<?x?xf32> {
+util.func public @propogateStaticShapeOfTarget(%arg0 : tensor<?x?xf32>, %arg1 : f32) -> tensor<?x?xf32> {
   %c21 = arith.constant 21 : index
   %c42 = arith.constant 42 : index
   %c2 = arith.constant 2 : index
@@ -667,14 +667,14 @@
   // CHECK: %[[UPDATED:.+]] = flow.tensor.update %{{.+}}, %[[TARGET]]
   // CHECK: %[[RESULT:.+]] = tensor.cast %[[UPDATED]] : tensor<21x42xf32> to tensor<?x?xf32>
   %1 = flow.tensor.update %arg0, %0[%c2, %c4] : tensor<?x?xf32>{%c21, %c42} -> tensor<?x?xf32>{%c21, %c42}
-  // CHECK: return %[[RESULT]]
-  return %1 : tensor<?x?xf32>
+  // CHECK: util.return %[[RESULT]]
+  util.return %1 : tensor<?x?xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @propogateStaticShapeOfUpdate
-func.func @propogateStaticShapeOfUpdate(%arg0 : tensor<?x?xf32>, %arg1 : f32) -> tensor<?x?xf32> {
+util.func public @propogateStaticShapeOfUpdate(%arg0 : tensor<?x?xf32>, %arg1 : f32) -> tensor<?x?xf32> {
   %c21 = arith.constant 21 : index
   %c42 = arith.constant 42 : index
   %c2 = arith.constant 2 : index
@@ -687,53 +687,53 @@
   } :  tensor<?x?xf32>
   // CHECK: %[[RESULT:.+]] = flow.tensor.update  %[[UPDATE]]
   %1 = flow.tensor.update %0, %arg0[%c2, %c4] : tensor<?x?xf32>{%c21, %c42} -> tensor<?x?xf32>{%c21, %c42}
-  // CHECK: return %[[RESULT]]
-  return %1 : tensor<?x?xf32>
+  // CHECK: util.return %[[RESULT]]
+  util.return %1 : tensor<?x?xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @foldSplatLoadIntoPrimitive
 //  CHECK-SAME: (%[[arg0:.+]]: f32, %[[arg1:.+]]: index, %[[arg2:.+]]: index)
-func.func @foldSplatLoadIntoPrimitive(%arg0 : f32, %arg1 : index, %arg2 : index) -> f32 {
-  // CHECK-NEXT: return %[[arg0]] : f32
+util.func public @foldSplatLoadIntoPrimitive(%arg0 : f32, %arg1 : index, %arg2 : index) -> f32 {
+  // CHECK-NEXT: util.return %[[arg0]] : f32
   %0 = flow.tensor.splat %arg0 : tensor<4x4xf32>
   %1 = flow.tensor.load %0[%arg1, %arg2] : tensor<4x4xf32>
-  return %1 : f32
+  util.return %1 : f32
 }
 
 // -----
 
 // CHECK-LABEL: @foldSplatReshapeIntoSplat
-func.func @foldSplatReshapeIntoSplat(%arg0 : f32) -> tensor<16xf32> {
+util.func public @foldSplatReshapeIntoSplat(%arg0 : f32) -> tensor<16xf32> {
   // CHECK-NEXT: %0 = flow.tensor.splat %arg0 : tensor<16xf32>
-  // CHECK-NEXT: return %0 : tensor<16xf32>
+  // CHECK-NEXT: util.return %0 : tensor<16xf32>
   %0 = flow.tensor.splat %arg0 : tensor<4x4xf32>
   %1 = flow.tensor.reshape %0 : tensor<4x4xf32> -> tensor<16xf32>
-  return %1 : tensor<16xf32>
+  util.return %1 : tensor<16xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @foldSplatReshapeIntoSplatDynamic
-func.func @foldSplatReshapeIntoSplatDynamic(%arg0 : f32, %arg1 : index, %arg2 : index, %arg3 : index) -> tensor<?x?xf32> {
+util.func public @foldSplatReshapeIntoSplatDynamic(%arg0 : f32, %arg1 : index, %arg2 : index, %arg3 : index) -> tensor<?x?xf32> {
   // CHECK-NEXT: %0 = flow.tensor.splat %arg0 : tensor<?x?xf32>{%arg2, %arg3}
-  // CHECK-NEXT: return %0 : tensor<?x?xf32>
+  // CHECK-NEXT: util.return %0 : tensor<?x?xf32>
   %0 = flow.tensor.splat %arg0 : tensor<?x4xf32>{%arg1}
   %1 = flow.tensor.reshape %0 : tensor<?x4xf32>{%arg1} -> tensor<?x?xf32>{%arg2, %arg3}
-  return %1 : tensor<?x?xf32>
+  util.return %1 : tensor<?x?xf32>
 }
 
 // -----
 
-func.func @innermost_unit_dim(%4: !flow.dispatch.tensor<readonly:tensor<3x1x16x257x88xf16>>,
+util.func public @innermost_unit_dim(%4: !flow.dispatch.tensor<readonly:tensor<3x1x16x257x88xf16>>,
     %arg0: index, %arg2 : index, %10 : index, %9 : index) -> tensor<?x?x?xf16> {
   %c16 = arith.constant 16 : index
   %c1 = arith.constant 1 : index
   %11 = flow.dispatch.tensor.load %4, offsets = [1, 0, %arg0, %10, %arg2], sizes = [1, 1, %c16, %9, %c1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1x16x257x88xf16>> -> tensor<?x?x?xf16>
-  return %11 : tensor<?x?x?xf16>
+  util.return %11 : tensor<?x?x?xf16>
 }
-// CHECK-LABEL: func @innermost_unit_dim
+// CHECK-LABEL: util.func public @innermost_unit_dim
 //  CHECK-SAME:     %[[DYNAMIC_DIM:[a-zA-Z0-9]+]]: index)
 //       CHECK:   flow.dispatch.tensor.load
 //  CHECK-SAME:       sizes = [1, 1, 16, %[[DYNAMIC_DIM]], 1]

diff --git a/compiler/src/iree/compiler/Dialect/Flow/IR/test/tensor_ops.mlir b/compiler/src/iree/compiler/Dialect/Flow/IR/test/tensor_ops.mlir
index 091a923..b0a19ad 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/IR/test/tensor_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/IR/test/tensor_ops.mlir

@@ -1,200 +1,200 @@
 // RUN: iree-opt --split-input-file %s | iree-opt --split-input-file | FileCheck %s
 
 // CHECK-LABEL: @tensorReshape
-func.func @tensorReshape(%arg0 : tensor<4x4xf32>) -> tensor<16xf32> {
+util.func public @tensorReshape(%arg0 : tensor<4x4xf32>) -> tensor<16xf32> {
   // CHECK-NEXT: %0 = flow.tensor.reshape %arg0 : tensor<4x4xf32> -> tensor<16xf32>
   %0 = flow.tensor.reshape %arg0 : tensor<4x4xf32> -> tensor<16xf32>
-  return %0 : tensor<16xf32>
+  util.return %0 : tensor<16xf32>
 }
 
 // CHECK-LABEL: @tensorReshapeScalar
-func.func @tensorReshapeScalar(%arg0 : tensor<f32>) -> tensor<f32> {
+util.func public @tensorReshapeScalar(%arg0 : tensor<f32>) -> tensor<f32> {
   // CHECK-NEXT: %0 = flow.tensor.reshape %arg0 : tensor<f32> -> tensor<f32>
   %0 = flow.tensor.reshape %arg0 : tensor<f32> -> tensor<f32>
-  return %0 : tensor<f32>
+  util.return %0 : tensor<f32>
 }
 
 // CHECK-LABEL: @tensorReshapeDynamic
-func.func @tensorReshapeDynamic(%arg0 : tensor<?x4xf32>) -> tensor<?x2xf32> {
+util.func public @tensorReshapeDynamic(%arg0 : tensor<?x4xf32>) -> tensor<?x2xf32> {
   %c4 = arith.constant 4 : index
   %c8 = arith.constant 8 : index
   // CHECK: %0 = flow.tensor.reshape %arg0 : tensor<?x4xf32>{%c4} -> tensor<?x2xf32>{%c8}
   %0 = flow.tensor.reshape %arg0 : tensor<?x4xf32>{%c4} -> tensor<?x2xf32>{%c8}
-  return %0 : tensor<?x2xf32>
+  util.return %0 : tensor<?x2xf32>
 }
 
 // CHECK-LABEL: @tensorReshapeComplex
-func.func @tensorReshapeComplex(%arg0 : tensor<4x4xcomplex<f32>>) -> tensor<16xcomplex<f32>> {
+util.func public @tensorReshapeComplex(%arg0 : tensor<4x4xcomplex<f32>>) -> tensor<16xcomplex<f32>> {
   // CHECK-NEXT: flow.tensor.reshape %arg0 : tensor<4x4xcomplex<f32>> -> tensor<16xcomplex<f32>>
   %0 = flow.tensor.reshape %arg0 : tensor<4x4xcomplex<f32>> -> tensor<16xcomplex<f32>>
-  return %0 : tensor<16xcomplex<f32>>
+  util.return %0 : tensor<16xcomplex<f32>>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorBitCast
-func.func @tensorBitCast(%arg0 : tensor<16xi32>) -> tensor<4x8xi16> {
+util.func public @tensorBitCast(%arg0 : tensor<16xi32>) -> tensor<4x8xi16> {
   // CHECK-NEXT: %0 = flow.tensor.bitcast %arg0 : tensor<16xi32> -> tensor<4x8xi16>
   %0 = flow.tensor.bitcast %arg0 : tensor<16xi32> -> tensor<4x8xi16>
-  return %0 : tensor<4x8xi16>
+  util.return %0 : tensor<4x8xi16>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorLoad
-func.func @tensorLoad(%arg0 : tensor<4x4xf32>, %arg1 : index, %arg2 : index) -> f32 {
+util.func public @tensorLoad(%arg0 : tensor<4x4xf32>, %arg1 : index, %arg2 : index) -> f32 {
   // CHECK-NEXT: %0 = flow.tensor.load %arg0[%arg1, %arg2] : tensor<4x4xf32>
   %0 = flow.tensor.load %arg0[%arg1, %arg2] : tensor<4x4xf32>
-  return %0 : f32
+  util.return %0 : f32
 }
 
 // CHECK-LABEL: @tensorLoadScalar
-func.func @tensorLoadScalar(%arg0 : tensor<f32>) -> f32 {
+util.func public @tensorLoadScalar(%arg0 : tensor<f32>) -> f32 {
   // CHECK-NEXT: %0 = flow.tensor.load %arg0 : tensor<f32>
   %0 = flow.tensor.load %arg0 : tensor<f32>
-  return %0 : f32
+  util.return %0 : f32
 }
 
 // CHECK-LABEL: @tensorLoadDynamic
-func.func @tensorLoadDynamic(%arg0 : tensor<?x4xf32>, %arg1 : index, %arg2 : index) -> f32 {
+util.func public @tensorLoadDynamic(%arg0 : tensor<?x4xf32>, %arg1 : index, %arg2 : index) -> f32 {
   %c4 = arith.constant 4 : index
   // CHECK: %0 = flow.tensor.load %arg0[%arg1, %arg2] : tensor<?x4xf32>{%c4}
   %0 = flow.tensor.load %arg0[%arg1, %arg2] : tensor<?x4xf32>{%c4}
-  return %0 : f32
+  util.return %0 : f32
 }
 
 // -----
 
 // CHECK-LABEL: @tensorStore
-func.func @tensorStore(%arg0 : tensor<4x4xf32>, %arg1 : index, %arg2 : index, %arg3 : f32) -> tensor<4x4xf32> {
+util.func public @tensorStore(%arg0 : tensor<4x4xf32>, %arg1 : index, %arg2 : index, %arg3 : f32) -> tensor<4x4xf32> {
   // CHECK-NEXT: %0 = flow.tensor.store %arg3, %arg0[%arg1, %arg2] : tensor<4x4xf32>
   %0 = flow.tensor.store %arg3, %arg0[%arg1, %arg2] : tensor<4x4xf32>
-  return %0 : tensor<4x4xf32>
+  util.return %0 : tensor<4x4xf32>
 }
 
 // CHECK-LABEL: @tensorStoreScalar
-func.func @tensorStoreScalar(%arg0 : f32, %arg1 : tensor<f32>) -> tensor<f32> {
+util.func public @tensorStoreScalar(%arg0 : f32, %arg1 : tensor<f32>) -> tensor<f32> {
   // CHECK-NEXT: %0 = flow.tensor.store %arg0, %arg1 : tensor<f32>
   %0 = flow.tensor.store %arg0, %arg1 : tensor<f32>
-  return %0 : tensor<f32>
+  util.return %0 : tensor<f32>
 }
 
 // CHECK-LABEL: @tensorStoreDynamic
-func.func @tensorStoreDynamic(%arg0 : tensor<?x4xf32>, %arg1 : index, %arg2 : index, %arg3 : f32) -> tensor<?x4xf32> {
+util.func public @tensorStoreDynamic(%arg0 : tensor<?x4xf32>, %arg1 : index, %arg2 : index, %arg3 : f32) -> tensor<?x4xf32> {
   %c4 = arith.constant 4 : index
   // CHECK: %0 = flow.tensor.store %arg3, %arg0[%arg1, %arg2] : tensor<?x4xf32>{%c4}
   %0 = flow.tensor.store %arg3, %arg0[%arg1, %arg2] : tensor<?x4xf32>{%c4}
-  return %0 : tensor<?x4xf32>
+  util.return %0 : tensor<?x4xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorAlloca
-func.func @tensorAlloca(%arg0: index) -> tensor<?x0x1xf32> {
+util.func public @tensorAlloca(%arg0: index) -> tensor<?x0x1xf32> {
   // CHECK-NEXT: = flow.tensor.alloca : tensor<?x0x1xf32>{%arg0}
   %0 = flow.tensor.alloca : tensor<?x0x1xf32>{%arg0}
-  return %0 : tensor<?x0x1xf32>
+  util.return %0 : tensor<?x0x1xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorEmpty
-func.func @tensorEmpty(%arg0: index) -> tensor<?x0x1xf32> {
+util.func public @tensorEmpty(%arg0: index) -> tensor<?x0x1xf32> {
   // CHECK-NEXT: = flow.tensor.empty : tensor<?x0x1xf32>{%arg0}
   %0 = flow.tensor.empty : tensor<?x0x1xf32>{%arg0}
-  return %0 : tensor<?x0x1xf32>
+  util.return %0 : tensor<?x0x1xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorSplat
-func.func @tensorSplat(%arg0 : f32) -> tensor<4x4xf32> {
+util.func public @tensorSplat(%arg0 : f32) -> tensor<4x4xf32> {
   // CHECK-NEXT: %0 = flow.tensor.splat %arg0 : tensor<4x4xf32>
   %0 = flow.tensor.splat %arg0 : tensor<4x4xf32>
-  return %0 : tensor<4x4xf32>
+  util.return %0 : tensor<4x4xf32>
 }
 
 // CHECK-LABEL: @tensorSplatScalar
-func.func @tensorSplatScalar(%arg0 : f32) -> tensor<f32> {
+util.func public @tensorSplatScalar(%arg0 : f32) -> tensor<f32> {
   // CHECK-NEXT: %0 = flow.tensor.splat %arg0 : tensor<f32>
   %0 = flow.tensor.splat %arg0 : tensor<f32>
-  return %0 : tensor<f32>
+  util.return %0 : tensor<f32>
 }
 
 // CHECK-LABEL: @tensorSplatDynamic
-func.func @tensorSplatDynamic(%arg0 : f32) -> tensor<?x4xf32> {
+util.func public @tensorSplatDynamic(%arg0 : f32) -> tensor<?x4xf32> {
   %c4 = arith.constant 4 : index
   // CHECK: %0 = flow.tensor.splat %arg0 : tensor<?x4xf32>{%c4}
   %0 = flow.tensor.splat %arg0 : tensor<?x4xf32>{%c4}
-  return %0 : tensor<?x4xf32>
+  util.return %0 : tensor<?x4xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorClone
-func.func @tensorClone(%arg0 : tensor<4x4xf32>) -> tensor<4x4xf32> {
+util.func public @tensorClone(%arg0 : tensor<4x4xf32>) -> tensor<4x4xf32> {
   // CHECK-NEXT: %0 = flow.tensor.clone %arg0 : tensor<4x4xf32>
   %0 = flow.tensor.clone %arg0 : tensor<4x4xf32>
-  return %0 : tensor<4x4xf32>
+  util.return %0 : tensor<4x4xf32>
 }
 
 // CHECK-LABEL: @tensorCloneScalar
-func.func @tensorCloneScalar(%arg0 : tensor<f32>) -> tensor<f32> {
+util.func public @tensorCloneScalar(%arg0 : tensor<f32>) -> tensor<f32> {
   // CHECK-NEXT: %0 = flow.tensor.clone %arg0 : tensor<f32>
   %0 = flow.tensor.clone %arg0 : tensor<f32>
-  return %0 : tensor<f32>
+  util.return %0 : tensor<f32>
 }
 
 // CHECK-LABEL: @tensorCloneDynamic
-func.func @tensorCloneDynamic(%arg0 : tensor<?x4xf32>) -> tensor<?x4xf32> {
+util.func public @tensorCloneDynamic(%arg0 : tensor<?x4xf32>) -> tensor<?x4xf32> {
   %c4 = arith.constant 4 : index
   // CHECK: %0 = flow.tensor.clone %arg0 : tensor<?x4xf32>{%c4}
   %0 = flow.tensor.clone %arg0 : tensor<?x4xf32>{%c4}
-  return %0 : tensor<?x4xf32>
+  util.return %0 : tensor<?x4xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorSlice
-func.func @tensorSlice(%arg0 : tensor<4x4xf32>, %arg1 : index, %arg2 : index) -> tensor<2x2xf32> {
+util.func public @tensorSlice(%arg0 : tensor<4x4xf32>, %arg1 : index, %arg2 : index) -> tensor<2x2xf32> {
   // CHECK-NEXT: %0 = flow.tensor.slice %arg0[%arg1, %arg2 for %arg2, %arg1] : tensor<4x4xf32> -> tensor<2x2xf32>
   %0 = flow.tensor.slice %arg0[%arg1, %arg2 for %arg2, %arg1] : tensor<4x4xf32> -> tensor<2x2xf32>
-  return %0 : tensor<2x2xf32>
+  util.return %0 : tensor<2x2xf32>
 }
 
 // CHECK-LABEL: @tensorSliceDynamic
-func.func @tensorSliceDynamic(%arg0 : tensor<?x4xf32>, %arg1 : index, %arg2 : index) -> tensor<?x2xf32> {
+util.func public @tensorSliceDynamic(%arg0 : tensor<?x4xf32>, %arg1 : index, %arg2 : index) -> tensor<?x2xf32> {
   %c2 = arith.constant 2 : index
   %c4 = arith.constant 4 : index
   // CHECK: %0 = flow.tensor.slice %arg0[%arg1, %arg2 for %arg2, %arg1] : tensor<?x4xf32>{%c4} -> tensor<?x2xf32>{%c2}
   %0 = flow.tensor.slice %arg0[%arg1, %arg2 for %arg2, %arg1] : tensor<?x4xf32>{%c4} -> tensor<?x2xf32>{%c2}
-  return %0 : tensor<?x2xf32>
+  util.return %0 : tensor<?x2xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorUpdate
-func.func @tensorUpdate(%arg0 : tensor<2x2xf32>, %arg1 : tensor<4x4xf32>, %arg2 : index, %arg3 : index) -> tensor<4x4xf32> {
+util.func public @tensorUpdate(%arg0 : tensor<2x2xf32>, %arg1 : tensor<4x4xf32>, %arg2 : index, %arg3 : index) -> tensor<4x4xf32> {
   // CHECK-NEXT: %0 = flow.tensor.update %arg0, %arg1[%arg2, %arg3] : tensor<2x2xf32> -> %arg1 as tensor<4x4xf32>
   %0 = flow.tensor.update %arg0, %arg1[%arg2, %arg3] : tensor<2x2xf32> -> %arg1 as tensor<4x4xf32>
-  return %0 : tensor<4x4xf32>
+  util.return %0 : tensor<4x4xf32>
 }
 
 // CHECK-LABEL: @tensorUpdateDynamic
-func.func @tensorUpdateDynamic(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x4xf32>, %arg2 : index, %arg3 : index) -> tensor<?x4xf32> {
+util.func public @tensorUpdateDynamic(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x4xf32>, %arg2 : index, %arg3 : index) -> tensor<?x4xf32> {
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
   %c3 = arith.constant 3 : index
   // CHECK: %0 = flow.tensor.update %arg0, %arg1[%arg2, %arg3] : tensor<?x?xf32>{%c1, %c2} -> %arg1 as tensor<?x4xf32>{%c3}
   %0 = flow.tensor.update %arg0, %arg1[%arg2, %arg3] : tensor<?x?xf32>{%c1, %c2} -> %arg1 as tensor<?x4xf32>{%c3}
-  return %0 : tensor<?x4xf32>
+  util.return %0 : tensor<?x4xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorTrace
 //  CHECK-SAME: (%[[TENSOR0:.+]]: tensor<5xf32>, %[[TENSOR1:.+]]: tensor<?x3x?xi32>, %[[TENSOR1_DIM0:.+]]: index, %[[TENSOR1_DIM2:.+]]: index)
-func.func @tensorTrace(%tensor0: tensor<5xf32>, %tensor1: tensor<?x3x?xi32>, %tensor1_dim0: index, %tensor1_dim2: index) {
+util.func public @tensorTrace(%tensor0: tensor<5xf32>, %tensor1: tensor<?x3x?xi32>, %tensor1_dim0: index, %tensor1_dim2: index) {
   //      CHECK: flow.tensor.trace "FOOBAR" = [
   // CHECK-SAME:   %[[TENSOR0]] : tensor<5xf32>,
   // CHECK-SAME:   %[[TENSOR1]] : tensor<?x3x?xi32>{%[[TENSOR1_DIM0]], %[[TENSOR1_DIM2]]}
@@ -203,5 +203,5 @@
     %tensor0 : tensor<5xf32>,
     %tensor1 : tensor<?x3x?xi32>{%tensor1_dim0, %tensor1_dim2}
   ]
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Flow/IR/test/types.mlir b/compiler/src/iree/compiler/Dialect/Flow/IR/test/types.mlir
index 9adead0..5de7246 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/IR/test/types.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/IR/test/types.mlir

@@ -1,7 +1,7 @@
 // RUN: iree-opt --split-input-file %s | iree-opt --split-input-file | FileCheck %s
 
 // CHECK-LABEL: @dispatchTypes
-func.func @dispatchTypes(
+util.func public @dispatchTypes(
     // CHECK-SAME: %arg0: !flow.dispatch.tensor<readonly:tensor<f32>>
     %arg0: !flow.dispatch.tensor<readonly:tensor<f32>>,
     // CHECK-SAME: %arg1: !flow.dispatch.tensor<readonly:tensor<4x4xf32>>
@@ -23,5 +23,5 @@
     // CHECK-SAME: %arg9: !flow.dispatch.tensor<writeonly:tensor<1x?x3xf32>>
     %arg9: !flow.dispatch.tensor<writeonly:tensor<1x?x3xf32>>
     ) {
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/ExportBenchmarkFuncs.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/ExportBenchmarkFuncs.cpp
index 07db885..47ab089 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/ExportBenchmarkFuncs.cpp
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/ExportBenchmarkFuncs.cpp

@@ -15,7 +15,6 @@
 #include "iree/compiler/Dialect/Util/IR/UtilDialect.h"
 #include "iree/compiler/Dialect/Util/IR/UtilOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/IRMapping.h"
@@ -194,7 +193,7 @@
 
 static LogicalResult
 createEntryPointBenchmarkFunc(mlir::ModuleOp moduleOp,
-                              mlir::func::FuncOp entryFuncOp,
+                              IREE::Util::FuncOp entryFuncOp,
                               Explorer &explorer) {
   auto symbolTable = explorer.getSymbolTables().getSymbolTable(moduleOp);
   OpBuilder moduleBuilder(moduleOp.getContext());
@@ -216,7 +215,7 @@
 
   // Create a `() -> ()` entry point op the benchmark tool can run.
   Location loc = entryFuncOp.getLoc();
-  auto funcOp = moduleBuilder.create<mlir::func::FuncOp>(
+  auto funcOp = moduleBuilder.create<IREE::Util::FuncOp>(
       loc, funcName, moduleBuilder.getFunctionType({}, {}));
   funcOp.setPublic();
   funcOp->setAttr("iree.abi.stub", moduleBuilder.getUnitAttr());
@@ -236,14 +235,14 @@
                        .createLoadOp(loc, blockBuilder)
                        .getLoadedGlobalValue());
   }
-  auto callOp = blockBuilder.create<mlir::func::CallOp>(loc, entryFuncOp, args);
+  auto callOp = blockBuilder.create<IREE::Util::CallOp>(loc, entryFuncOp, args);
 
   // Sink all results with a barrier to ensure that DCE does not remove the
   // call.
   for (auto result : callOp.getResults()) {
     blockBuilder.create<IREE::Util::OptimizationBarrierOp>(loc, result);
   }
-  blockBuilder.create<mlir::func::ReturnOp>(loc);
+  blockBuilder.create<IREE::Util::ReturnOp>(loc);
 
   // Ensure the original function is not exported and not inlined.
   entryFuncOp->setAttr("noinline", moduleBuilder.getUnitAttr());
@@ -274,8 +273,8 @@
     // Gather the functions we want to wrap for benchmarking and wrap them.
     // Since we are inserting new functions as part of this pass we must perform
     // the wrapping for only the inputs.
-    SmallVector<mlir::func::FuncOp> entryFuncOps;
-    for (auto entryFuncOp : moduleOp.getOps<mlir::func::FuncOp>()) {
+    SmallVector<IREE::Util::FuncOp> entryFuncOps;
+    for (auto entryFuncOp : moduleOp.getOps<IREE::Util::FuncOp>()) {
       if (entryFuncOp.isPublic()) {
         entryFuncOps.push_back(entryFuncOp);
       }

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/InsertDispatchDebugTargets.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/InsertDispatchDebugTargets.cpp
index bd417ee..cca18e6 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/InsertDispatchDebugTargets.cpp
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/InsertDispatchDebugTargets.cpp

@@ -11,9 +11,10 @@
 #include "iree/compiler/Dialect/Flow/Transforms/Passes.h"
 #include "iree/compiler/Dialect/HAL/IR/HALDialect.h"
 #include "iree/compiler/Dialect/HAL/IR/HALOps.h"
+#include "iree/compiler/Dialect/Util/IR/UtilDialect.h"
+#include "iree/compiler/Dialect/Util/IR/UtilOps.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/Regex.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Diagnostics.h"
@@ -72,7 +73,7 @@
 // after the op. Updates the function signature to match the return type of the
 // target operation.
 static LogicalResult replaceReturnWithOpResults(mlir::ModuleOp moduleOp,
-                                                mlir::func::FuncOp funcOp,
+                                                IREE::Util::FuncOp funcOp,
                                                 Operation *op) {
   if (!funcOp->isProperAncestor(op))
     return failure();
@@ -110,7 +111,7 @@
 
   // Create the new return and update the function type.
   IRRewriter rewriter(builder);
-  rewriter.replaceOpWithNewOp<mlir::func::ReturnOp>(oldTerminator, exports);
+  rewriter.replaceOpWithNewOp<IREE::Util::ReturnOp>(oldTerminator, exports);
 
   SmallVector<Type> argTypes;
   for (const auto &arg : llvm::enumerate(funcOp.getArguments()))
@@ -118,6 +119,7 @@
 
   funcOp.setType(FunctionType::get(context,
                                    /*inputs=*/argTypes, /*results=*/newTypes));
+  funcOp.removeTiedOperandsAttr();
   return success();
 }
 
@@ -147,7 +149,7 @@
       Operation *operation = op;
 
       // Only look for dispatches in upstream func ops.
-      auto funcOp = llvm::dyn_cast<mlir::func::FuncOp>(operation);
+      auto funcOp = llvm::dyn_cast<IREE::Util::FuncOp>(operation);
       if (!funcOp)
         continue;
 
@@ -190,7 +192,8 @@
 struct InsertDebugTargetAtSymbolPass
     : public InsertDebugTargetAtSymbolBase<InsertDebugTargetAtSymbolPass> {
   void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<IREE::Flow::FlowDialect, IREE::HAL::HALDialect>();
+    registry.insert<IREE::Flow::FlowDialect, IREE::HAL::HALDialect,
+                    IREE::Util::UtilDialect>();
   }
   InsertDebugTargetAtSymbolPass(std::string breakStr, std::string traceStr) {
     this->breakDebugTarget = breakStr;
@@ -231,7 +234,7 @@
       // dispatch is not found within the entry block of the function.
       if (breakTarget) {
         Operation *operation = funcOp;
-        auto mlirFuncOp = dyn_cast<mlir::func::FuncOp>(operation);
+        auto mlirFuncOp = dyn_cast<IREE::Util::FuncOp>(operation);
         if (!mlirFuncOp || failed(replaceReturnWithOpResults(
                                getOperation(), mlirFuncOp, breakTarget)))
           return signalPassFailure();

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/OutlineDispatchRegions.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/OutlineDispatchRegions.cpp
index 3ca48ee..fff566d 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/OutlineDispatchRegions.cpp
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/OutlineDispatchRegions.cpp

@@ -158,7 +158,7 @@
 public:
   OutlineDispatchRegionsPass() = default;
   void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<IREE::Flow::FlowDialect>();
+    registry.insert<func::FuncDialect, IREE::Flow::FlowDialect>();
   }
 
   void runOnOperation() override {

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/BUILD.bazel b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/BUILD.bazel
index f5b314d..93c510f 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/BUILD.bazel

@@ -37,7 +37,7 @@
             "fusion_of_tensor_ops.mlir",
             "initialize_empty_tensors.mlir",
             "inject_dispatch_tracing.mlir",
-            "insert_dispatch_debug_markers.mlir",
+            "insert_dispatch_debug_targets.mlir",
             "interchange_generic_ops.mlir",
             "interchange_transpose_generic_ops.mlir",
             "outline_dispatch_externs.mlir",

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/CMakeLists.txt
index 6af5a6e..9f76000 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/CMakeLists.txt

@@ -35,7 +35,7 @@
     "fusion_of_tensor_ops.mlir"
     "initialize_empty_tensors.mlir"
     "inject_dispatch_tracing.mlir"
-    "insert_dispatch_debug_markers.mlir"
+    "insert_dispatch_debug_targets.mlir"
     "interchange_generic_ops.mlir"
     "interchange_transpose_generic_ops.mlir"
     "outline_dispatch_externs.mlir"

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/annotate_dispatches.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/annotate_dispatches.mlir
index 92de50e..7ba9196 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/annotate_dispatches.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/annotate_dispatches.mlir

@@ -31,14 +31,14 @@
     }
   }
 }
-func.func @main() -> (tensor<4x8xf32>, tensor<8x4xf32>) {
+util.func public @main() -> (tensor<4x8xf32>, tensor<8x4xf32>) {
   %c100 = arith.constant 100 : index
   %c50 = arith.constant 50 : index
   // CHECK: flow.dispatch @ex0::@dispatch0_fill_4x8_f32
   %0 = flow.dispatch @ex0::@dispatch0[%c100, %c50]() : () -> tensor<4x8xf32>
   // CHECK: flow.dispatch @ex1::@dispatch1_fill_8x4_f32
   %1 = flow.dispatch @ex1::@dispatch1[%c100, %c50]() : () -> tensor<8x4xf32>
-  return %0, %1 : tensor<4x8xf32>, tensor<8x4xf32>
+  util.return %0, %1 : tensor<4x8xf32>, tensor<8x4xf32>
 }
 
 // -----

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/capture_dispatch_dynamic_dims.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/capture_dispatch_dynamic_dims.mlir
index 4a8882f..2230b87 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/capture_dispatch_dynamic_dims.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/capture_dispatch_dynamic_dims.mlir

@@ -5,7 +5,7 @@
 
 // CHECK-LABEL: @captureDims
 // CHECK-SAME: (%[[ARG0:.+]]: tensor<?x?xf32>, %[[ARG0_DIM0:.+]]: index, %[[ARG0_DIM1:.+]]: index, %[[RET0_DIM0:.+]]: index, %[[RET0_DIM1:.+]]: index)
-func.func @captureDims(%arg0: tensor<?x?xf32>, %arg0_dim0: index, %arg0_dim1: index, %ret0_dim0: index, %ret0_dim1: index) {
+util.func public @captureDims(%arg0: tensor<?x?xf32>, %arg0_dim0: index, %arg0_dim1: index, %ret0_dim0: index, %ret0_dim1: index) {
   %c1 = arith.constant 1 : index
   // CHECK: flow.dispatch.workgroups[%c1, %c1, %c1](%[[ARG0]], %[[ARG0_DIM0]], %[[RET0_DIM0]], %[[ARG0_DIM1]], %[[RET0_DIM1]])
   %0 = flow.dispatch.workgroups[%c1, %c1, %c1](%arg0, %arg0_dim0, %ret0_dim0) : (tensor<?x?xf32>{%arg0_dim0, %arg0_dim1}, index, index) -> tensor<?x?xf32>{%ret0_dim0, %ret0_dim1} =
@@ -15,7 +15,7 @@
     // CHECK-DAG: = flow.dispatch.tie_shape %[[RET0_CAPTURE]] : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%[[RET0_DIM0_CAPTURE]], %[[RET0_DIM1_CAPTURE]]}
     flow.return
   }
-  return
+  util.return
 }
 
 // -----
@@ -25,7 +25,7 @@
 
 // CHECK-LABEL: @capture2DimsForOneTensor
 // CHECK-SAME: (%[[ARG0:.+]]: tensor<?x?xf32>, %[[ARG0_DIM0:.+]]: index, %[[ARG0_DIM1:.+]]: index, %[[RET0_DIM0:.+]]: index, %[[RET0_DIM1:.+]]: index)
-func.func @capture2DimsForOneTensor(%arg0: tensor<?x?xf32>, %arg0_dim0: index, %arg0_dim1: index, %ret0_dim0: index, %ret0_dim1: index) {
+util.func public @capture2DimsForOneTensor(%arg0: tensor<?x?xf32>, %arg0_dim0: index, %arg0_dim1: index, %ret0_dim0: index, %ret0_dim1: index) {
   %c1 = arith.constant 1 : index
   // CHECK: flow.dispatch.workgroups[%c1, %c1, %c1](%[[ARG0]], %[[ARG0_DIM0]], %[[ARG0_DIM1]], %[[RET0_DIM0]], %[[RET0_DIM1]])
   %0 = flow.dispatch.workgroups[%c1, %c1, %c1](%arg0) : (tensor<?x?xf32>{%arg0_dim0, %arg0_dim1}) -> tensor<?x?xf32>{%ret0_dim0, %ret0_dim1} =
@@ -35,7 +35,7 @@
     // CHECK-DAG: = flow.dispatch.tie_shape %[[RET0_CAPTURE]] : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%[[RET0_DIM0_CAPTURE]], %[[RET0_DIM1_CAPTURE]]}
     flow.return
   }
-  return
+  util.return
 }
 
 // -----
@@ -44,7 +44,7 @@
 
 // CHECK-LABEL: @capturedTiedDims
 // CHECK-SAME: (%[[ARG0:.+]]: tensor<?x?xf32>, %[[ARG0_DIM0:.+]]: index, %[[ARG0_DIM1:.+]]: index)
-func.func @capturedTiedDims(%arg0: tensor<?x?xf32>, %arg0_dim0: index, %arg0_dim1: index) {
+util.func public @capturedTiedDims(%arg0: tensor<?x?xf32>, %arg0_dim0: index, %arg0_dim1: index) {
   %c1 = arith.constant 1 : index
   // CHECK: flow.dispatch.workgroups[%c1, %c1, %c1](%[[ARG0]], %[[ARG0_DIM0]], %[[ARG0_DIM1]])
   %0 = flow.dispatch.workgroups[%c1, %c1, %c1](%arg0, %arg0_dim0) : (tensor<?x?xf32>{%arg0_dim0, %arg0_dim1}, index) -> %arg0{%arg0_dim0, %arg0_dim1} =
@@ -53,5 +53,5 @@
     // CHECK-DAG: = flow.dispatch.tie_shape %[[ARG0_CAPTURE]] : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%[[ARG0_DIM0_CAPTURE]], %[[ARG0_DIM1_CAPTURE]]}
     flow.return
   }
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/cleanup_tensor_shapes.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/cleanup_tensor_shapes.mlir
index 82ade23..d7963e6 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/cleanup_tensor_shapes.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/cleanup_tensor_shapes.mlir

@@ -4,12 +4,12 @@
 
 // CHECK-LABEL: @stripTieShape
 // CHECK-SAME: (%[[ARG0:.+]]: tensor<?xi32>, %[[ARG1:.+]]: index)
-func.func @stripTieShape(%arg0: tensor<?xi32>, %arg1: index) {
+util.func public @stripTieShape(%arg0: tensor<?xi32>, %arg1: index) {
   // CHECK-NOT: flow.tensor.tie_shape
   %0 = flow.tensor.tie_shape %arg0 : tensor<?xi32>{%arg1}
   // CHECK: util.optimization_barrier %[[ARG0]]
   %1 = util.optimization_barrier %0 : tensor<?xi32>
-  return
+  util.return
 }
 
 // -----
@@ -19,10 +19,10 @@
 // pipeline and if they haven't been by now there's nothing else to lower them
 // into.
 
-func.func @invalidTensorDim(%arg0: tensor<?xi32>) {
+util.func public @invalidTensorDim(%arg0: tensor<?xi32>) {
   %c0 = arith.constant 0 : index
   // expected-error @+1 {{'tensor.dim' op unexpected during shape cleanup}}
   %0 = tensor.dim %arg0, %c0 : tensor<?xi32>
   %1 = util.optimization_barrier %0 : index
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/clone_producers_into_dispatch_regions.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/clone_producers_into_dispatch_regions.mlir
index 3393b07..136af8b 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/clone_producers_into_dispatch_regions.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/clone_producers_into_dispatch_regions.mlir

@@ -1,6 +1,6 @@
-// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-flow-clone-producers-into-dispatch-regions))" %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(util.func(iree-flow-clone-producers-into-dispatch-regions))" %s | FileCheck %s
 
-func.func @complex_element_type(%input: tensor<4xi32>, %table: tensor<8x2xcomplex<f32>>) -> tensor<4x2xcomplex<f32>> {
+util.func public @complex_element_type(%input: tensor<4xi32>, %table: tensor<8x2xcomplex<f32>>) -> tensor<4x2xcomplex<f32>> {
   %c4095 = arith.constant 4095 : i32
   %const = arith.constant dense<[
     [(0x7FC00000,0.000000e+00), (0x7FC00000,1.000000e+00)], [(0x7FC00000,2.000000e+00), (0x7FC00000,3.000000e+00)],
@@ -22,10 +22,10 @@
     } -> tensor<4x2xcomplex<f32>>
     flow.return %generic : tensor<4x2xcomplex<f32>>
   }
-  return %0 : tensor<4x2xcomplex<f32>>
+  util.return %0 : tensor<4x2xcomplex<f32>>
 }
 
-// CHECK-LABEL: func.func @complex_element_type
+// CHECK-LABEL: util.func public @complex_element_type
 //       CHECK:   flow.dispatch.region
 //       CHECK:     %[[EMPTY:.+]] = tensor.empty() : tensor<4x2xcomplex<f32>>
 //       CHECK:     %[[CST:.+]] = arith.constant dense<{{.+}}> : tensor<4x2xcomplex<f32>>
@@ -36,7 +36,7 @@
 
 // -----
 
-func.func @complex_constant_clone(%input: tensor<4x2xcomplex<f32>>) -> tensor<4x2xcomplex<f32>> {
+util.func public @complex_constant_clone(%input: tensor<4x2xcomplex<f32>>) -> tensor<4x2xcomplex<f32>> {
   %cst = complex.constant [1.000000e+00 : f32, 2.000000e+00 : f32] : complex<f32>
   %empty = tensor.empty() : tensor<4x2xcomplex<f32>>
   %0 = linalg.fill ins(%cst : complex<f32>) outs(%empty : tensor<4x2xcomplex<f32>>) -> tensor<4x2xcomplex<f32>>
@@ -51,7 +51,7 @@
     } -> tensor<4x2xcomplex<f32>>
     flow.return %generic : tensor<4x2xcomplex<f32>>
   }
-  return %1 : tensor<4x2xcomplex<f32>>
+  util.return %1 : tensor<4x2xcomplex<f32>>
 }
 
 // CHECK-LABEL: @complex_constant_clone
@@ -66,7 +66,7 @@
 
 // -----
 
-func.func @complex_create(%real : f32, %imag : f32, %input: tensor<4x2xcomplex<f32>>) -> tensor<4x2xcomplex<f32>> {
+util.func public @complex_create(%real : f32, %imag : f32, %input: tensor<4x2xcomplex<f32>>) -> tensor<4x2xcomplex<f32>> {
   %cst = complex.create %real, %imag : complex<f32>
   %empty = tensor.empty() : tensor<4x2xcomplex<f32>>
   %0 = linalg.fill ins(%cst : complex<f32>) outs(%empty : tensor<4x2xcomplex<f32>>) -> tensor<4x2xcomplex<f32>>
@@ -81,7 +81,7 @@
     } -> tensor<4x2xcomplex<f32>>
     flow.return %generic : tensor<4x2xcomplex<f32>>
   }
-  return %0 : tensor<4x2xcomplex<f32>>
+  util.return %0 : tensor<4x2xcomplex<f32>>
 }
 
 // CHECK-LABEL: @complex_create
@@ -96,7 +96,7 @@
 
 // -----
 
-func.func @use_in_dispatch_count(%arg0: tensor<1xi32>, %arg1: tensor<1xi32>) -> tensor<i32> {
+util.func public @use_in_dispatch_count(%arg0: tensor<1xi32>, %arg1: tensor<1xi32>) -> tensor<i32> {
   %c1 = arith.constant 1 : index
   %c2_i32 = arith.constant 2 : i32
   %c0 = arith.constant 0 : index
@@ -112,7 +112,7 @@
   } count() -> (index, index, index) {
     flow.return %c1, %c1, %c1 : index, index, index
   }
-  return %4 : tensor<i32>
+  util.return %4 : tensor<i32>
 }
 
 
@@ -126,7 +126,7 @@
 
 // -----
 
-func.func @clone_dequantization(%arg0: tensor<4096x32x128xi8>, %arg1: tensor<1x1x32x128xf32>, %arg2: tensor<4096x32x1xf32>, %arg3: tensor<4096x32x1xf32>) -> tensor<1x1x4096xf32> {
+util.func public @clone_dequantization(%arg0: tensor<4096x32x128xi8>, %arg1: tensor<1x1x32x128xf32>, %arg2: tensor<4096x32x1xf32>, %arg3: tensor<4096x32x1xf32>) -> tensor<1x1x4096xf32> {
   %cst = arith.constant 0.000000e+00 : f32
   %0 = tensor.empty() : tensor<1x1x4096xf32>
   %1 = tensor.empty() : tensor<4096x32x128xf32>
@@ -159,9 +159,9 @@
     } -> tensor<1x1x4096xf32>
     flow.return %4 : tensor<1x1x4096xf32>
   }
-  return %9 : tensor<1x1x4096xf32>
+  util.return %9 : tensor<1x1x4096xf32>
 }
-//       CHECK: func.func @clone_dequantization
+//       CHECK: util.func public @clone_dequantization
 //  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: tensor<4096x32x128xi8>
 //  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: tensor<1x1x32x128xf32>
 //  CHECK-SAME:   %[[ARG2:[a-zA-Z0-9_]+]]: tensor<4096x32x1xf32>
@@ -181,13 +181,13 @@
 //  CHECK-SAME:       ins(%[[ARG1]], %[[GEN0]] :
 //  CHECK-SAME:       outs(%[[FILL]] :
 //       CHECK:   flow.return %[[GEN1]] :
-//       CHECK:   return %[[DISP]]
+//       CHECK:   util.return %[[DISP]]
 
 // -----
 
 #map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>
 module {
-  func.func @clone_dequantization_like(%arg0: tensor<32x1x16x1x8xi16>, %arg1: tensor<32x344x16x32x8xi4>) -> tensor<32x1x344x1x32xi32> {
+  util.func public @clone_dequantization_like(%arg0: tensor<32x1x16x1x8xi16>, %arg1: tensor<32x344x16x32x8xi4>) -> tensor<32x1x344x1x32xi32> {
     %c0_i32 = arith.constant 0 : i32
     %0 = tensor.empty() : tensor<32x1x16x1x8xi32>
     %1 = linalg.generic {indexing_maps = [#map, #map],
@@ -211,10 +211,10 @@
       %7 = linalg.batch_mmt4d ins(%1, %3 : tensor<32x1x16x1x8xi32>, tensor<32x344x16x32x8xi32>) outs(%5 : tensor<32x1x344x1x32xi32>) -> tensor<32x1x344x1x32xi32>
       flow.return %7 : tensor<32x1x344x1x32xi32>
     }
-    return %6 : tensor<32x1x344x1x32xi32>
+    util.return %6 : tensor<32x1x344x1x32xi32>
   }
 }
-//       CHECK: func.func @clone_dequantization
+//       CHECK: util.func public @clone_dequantization
 //  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: tensor<32x1x16x1x8xi16>
 //  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: tensor<32x344x16x32x8xi4>
 //       CHECK:   %[[DISP:.+]] = flow.dispatch.region -> (tensor<32x1x344x1x32xi32>)
@@ -236,4 +236,4 @@
 //  CHECK-SAME:       ins(%[[GEN0]], %[[GEN1]] :
 //  CHECK-SAME:       outs(%[[FILL]] :
 //       CHECK:   flow.return %[[MMT4D]] :
-//       CHECK:   return %[[DISP]]
+//       CHECK:   util.return %[[DISP]]

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/collapse_dimensions.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/collapse_dimensions.mlir
index 3417108..01a289d 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/collapse_dimensions.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/collapse_dimensions.mlir

@@ -1,7 +1,7 @@
-// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-flow-collapse-dimensions))" %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(util.func(iree-flow-collapse-dimensions))" %s | FileCheck %s
 
 #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
-func.func @do_not_collapse_cst_in_place(%arg0: tensor<1x1x2304xf32>) {
+util.func public @do_not_collapse_cst_in_place(%arg0: tensor<1x1x2304xf32>) {
   %cst = arith.constant dense<0.000000e+00> : tensor<1x1x2304xf32>
   %0 = tensor.empty() : tensor<1x1x2304xf32>
   %1 = flow.dispatch.region -> (tensor<1x1x2304xf32>) {
@@ -13,9 +13,9 @@
     } -> tensor<1x1x2304xf32>
     flow.return %3 : tensor<1x1x2304xf32>
   }
-  return
+  util.return
 }
-// CHECK-LABEL: func.func @do_not_collapse_cst_in_place
+// CHECK-LABEL: util.func public @do_not_collapse_cst_in_place
 // CHECK-SAME:    %[[ARG0:[0-9a-zA-Z]]]
 // CHECK-DAG:     %[[CST:.+]] = arith.constant
 // CHECK-DAG:     %[[COLLAPSED_ARG0:.+]] = tensor.collapse_shape %[[ARG0]]

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/collapse_linalg_generic_on_tensors.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/collapse_linalg_generic_on_tensors.mlir
index 4ca4637..531161d 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/collapse_linalg_generic_on_tensors.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/collapse_linalg_generic_on_tensors.mlir

@@ -1,8 +1,8 @@
-// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-flow-form-dispatch-regions{fuse-multi-use=true}, iree-flow-clone-producers-into-dispatch-regions, iree-flow-collapse-dimensions, cse))" %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(util.func(iree-flow-form-dispatch-regions{fuse-multi-use=true}, iree-flow-clone-producers-into-dispatch-regions, iree-flow-collapse-dimensions, cse))" %s | FileCheck %s
 !type = tensor<2x4x8x16x32x64xf32>
 util.global private @"__transpose_10_input" {inlining_policy = #util.inline.never} = dense<1.0> : !type
 
-func.func @collapse1() -> !type {
+util.func public @collapse1() -> !type {
   %cst = arith.constant 0.000000e+00 : f32
   %c0 = arith.constant 0 : index
   %input_ptr = util.global.address @"__transpose_10_input" : !util.ptr<!type>
@@ -19,12 +19,12 @@
     ^bb0(%arg1: f32, %arg2: f32):
       linalg.yield %arg1 : f32
     } -> !type
-  return %6: !type
+  util.return %6: !type
 
 }
 
 //       CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0)>
-// CHECK-LABEL: func.func @collapse1
+// CHECK-LABEL: util.func public @collapse1
 //       CHECK:   %[[IN:.+]] = tensor.collapse_shape %[[INPUT:.+]] {{\[}}[0, 1, 2, 3, 4, 5]] : tensor<2x4x8x16x32x64xf32> into tensor<2097152xf32>
 //       CHECK:   %[[RES:.+]] = flow.dispatch.region
 //       CHECK:     %[[OUT:.+]] = tensor.empty() : tensor<2097152xf32>
@@ -37,7 +37,7 @@
 !type = tensor<2x4x8x32x32x64x128xf32>
 util.global private @"__transpose_10_input" {inlining_policy = #util.inline.never} = dense<1.0> : !type
 
-func.func @collapse2() -> !type {
+util.func public @collapse2() -> !type {
   %cst = arith.constant 0.000000e+00 : f32
   %c0 = arith.constant 0 : index
   %input_ptr = util.global.address @"__transpose_10_input" : !util.ptr<!type>
@@ -54,13 +54,13 @@
     ^bb0(%arg1: f32, %arg2: f32):
       linalg.yield %arg1 : f32
     } -> !type
-  return %6: !type
+  util.return %6: !type
 
 }
 
 //       CHECK: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d2, d4)>
 //       CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>
-// CHECK-LABEL: func.func @collapse2
+// CHECK-LABEL: util.func public @collapse2
 //       CHECK:   %[[IN:.+]] = tensor.collapse_shape %[[INPUT:.+]] {{\[}}[0, 1], [2], [3], [4], [5, 6]] : tensor<2x4x8x32x32x64x128xf32> into tensor<8x8x32x32x8192xf32>
 //       CHECK:   %[[RES:.+]] = flow.dispatch.region
 //       CHECK:     %[[OUT:.+]] = tensor.empty() : tensor<8x8x32x32x8192xf32>
@@ -72,7 +72,7 @@
 !type = tensor<2x4x8x16x32x64x128x256xf32>
 util.global private @"__transpose_10_input" {inlining_policy = #util.inline.never} = dense<1.0> : !type
 
-func.func @collapse3() -> !type {
+util.func public @collapse3() -> !type {
   %cst = arith.constant 0.000000e+00 : f32
   %c0 = arith.constant 0 : index
   %input_ptr = util.global.address @"__transpose_10_input" : !util.ptr<!type>
@@ -89,12 +89,12 @@
   ^bb0(%arg1: f32, %arg2: f32):
     linalg.yield %arg1 : f32
   } -> !type
-  return %result: !type
+  util.return %result: !type
 
 }
 
 //       CHECK: #[[$MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
-// CHECK-LABEL: func.func @collapse3
+// CHECK-LABEL: util.func public @collapse3
 //       CHECK:   %[[IN:.+]] = tensor.collapse_shape %[[INPUT:.+]] {{\[}}[0, 1], [2], [3, 4, 5, 6, 7]] : tensor<2x4x8x16x32x64x128x256xf32> into tensor<8x8x1073741824xf32>
 //       CHECK:   %[[RES:.+]] = flow.dispatch.region
 //       CHECK:     %[[OUT:.+]] = tensor.empty() : tensor<8x8x1073741824xf32>
@@ -106,7 +106,7 @@
 
 !type = tensor<2x4x8x16x64x64x128x256xf32>
 util.global private @"__transpose_10_input" {inlining_policy = #util.inline.never} = dense<1.0> : !type
-func.func @collapse4() -> !type {
+util.func public @collapse4() -> !type {
   %cst = arith.constant 0.000000e+00 : f32
   %c0 = arith.constant 0 : index
   %input_ptr = util.global.address @"__transpose_10_input" : !util.ptr<!type>
@@ -123,13 +123,13 @@
   ^bb0(%arg1: f32, %arg2: f32):
     linalg.yield %arg1 : f32
   } -> !type
-  return %result: !type
+  util.return %result: !type
 
 }
 
 //       CHECK: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>
 //       CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4, d3, d5)>
-// CHECK-LABEL: func.func @collapse4
+// CHECK-LABEL: util.func public @collapse4
 //       CHECK:   %[[IN:.+]] = tensor.collapse_shape %[[INPUT:.+]] {{\[}}[0, 1], [2], [3], [4], [5], [6, 7]] : tensor<2x4x8x16x64x64x128x256xf32> into tensor<8x8x16x64x64x32768xf32>
 //       CHECK:   %[[RES:.+]] = flow.dispatch.region
 //       CHECK:     %[[OUT:.+]] = tensor.empty() : tensor<8x8x16x64x64x32768xf32>
@@ -141,7 +141,7 @@
 
 !type = tensor<2x4x32x32x32x64x128x256xf32>
 util.global private @"__transpose_10_input" {inlining_policy = #util.inline.never} = dense<1.0> : !type
-func.func @collapse5() -> !type {
+util.func public @collapse5() -> !type {
   %cst = arith.constant 0.000000e+00 : f32
   %c0 = arith.constant 0 : index
   %input_ptr = util.global.address @"__transpose_10_input" : !util.ptr<!type>
@@ -163,14 +163,14 @@
   ^bb0(%arg1: f32, %arg2: f32, %arg3: f32, %arg4: f32):
     linalg.yield %arg1 : f32
   } -> !type
-  return %result: !type
+  util.return %result: !type
 
 }
 
 //       CHECK: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>
 //       CHECK: #[[$MAP1:.+]] =  affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d2, d4, d5)>
 //       CHECK: #[[$MAP2:.+]] =  affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d3, d2, d1, d4, d5)>
-// CHECK-LABEL: func.func @collapse5
+// CHECK-LABEL: util.func public @collapse5
 //       CHECK:   %[[IN:.+]] = tensor.collapse_shape %[[INPUT:.+]] {{\[}}[0, 1], [2], [3], [4], [5], [6, 7]] : tensor<2x4x32x32x32x64x128x256xf32> into tensor<8x32x32x32x64x32768xf32>
 //       CHECK:   %[[IN1:.+]] = tensor.collapse_shape %[[INPUT1:.+]] {{\[}}[0, 1], [2], [3], [4], [5], [6, 7]] : tensor<2x4x32x32x32x64x128x256xf32> into tensor<8x32x32x32x64x32768xf32>
 //       CHECK:   %[[IN2:.+]] = tensor.collapse_shape %[[INPUT2:.+]] {{\[}}[0, 1], [2], [3], [4], [5], [6, 7]] : tensor<2x4x32x32x32x64x128x256xf32> into tensor<8x32x32x32x64x32768xf32>
@@ -184,7 +184,7 @@
 
 !type = tensor<32x2x4x8x16x16x64x128xf32>
 util.global private @"__transpose_10_input" {inlining_policy = #util.inline.never} = dense<1.0> : !type
-func.func @collapse6() -> !type {
+util.func public @collapse6() -> !type {
   %cst = arith.constant 0.000000e+00 : f32
   %c0 = arith.constant 0 : index
   %input_ptr = util.global.address @"__transpose_10_input" : !util.ptr<!type>
@@ -201,13 +201,13 @@
   ^bb0(%arg1: f32, %arg2: f32):
     linalg.yield %arg1 : f32
   } -> !type
-  return %result: !type
+  util.return %result: !type
 
 }
 
 //       CHECK: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>
 //       CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d4, d3, d5)>
-// CHECK-LABEL: func.func @collapse6
+// CHECK-LABEL: util.func public @collapse6
 //       CHECK:   %[[IN:.+]] = tensor.collapse_shape %[[INPUT:.+]] {{\[}}[0], [1], [2, 3], [4], [5], [6, 7]] : tensor<32x2x4x8x16x16x64x128xf32> into tensor<32x2x32x16x16x8192xf32>
 //       CHECK:   %[[RES:.+]] = flow.dispatch.region
 //       CHECK:     %[[OUT:.+]] = tensor.empty() : tensor<32x2x32x16x16x8192xf32>
@@ -220,7 +220,7 @@
 !type_out = tensor<2x4x8x16xf32>
 !type_in = tensor<2x4x8xf32>
 util.global private @"__transpose_10_input" {inlining_policy = #util.inline.never} = dense<1.0> : !type_in
-func.func @collapse7() -> !type_out {
+util.func public @collapse7() -> !type_out {
   %cst = arith.constant 0.000000e+00 : f32
   %c0 = arith.constant 0 : index
   %input_ptr = util.global.address @"__transpose_10_input" : !util.ptr<!type_in>
@@ -236,12 +236,12 @@
   ^bb0(%arg1: f32, %arg2: f32):
     linalg.yield %arg1 : f32
   } -> !type_out
-  return %result: !type_out
+  util.return %result: !type_out
 }
 
 //       CHECK: #[[$MAP:.+]] =  affine_map<(d0, d1) -> (d1)>
 //       CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1) -> (d1, d0)>
-// CHECK-LABEL: func.func @collapse7
+// CHECK-LABEL: util.func public @collapse7
 //       CHECK:   %[[IN:.+]] = tensor.collapse_shape %[[INPUT:.+]] {{\[}}[0, 1, 2]] : tensor<2x4x8xf32> into tensor<64xf32>
 //       CHECK:   %[[RES:.+]] = flow.dispatch.region
 //       CHECK:     %[[OUT:.+]] = tensor.empty() : tensor<64x16xf32>
@@ -253,7 +253,7 @@
 
 !type_in = tensor<16x4x32x2xf32>
 !type_out = tensor<8x16x4x32x8x2xf32>
-func.func @collapse8(%input : !type_in) -> !type_out {
+util.func public @collapse8(%input : !type_in) -> !type_out {
   %cst = arith.constant 0.000000e+00 : f32
   %c0 = arith.constant 0 : index
   %output = tensor.empty() : !type_out
@@ -268,12 +268,12 @@
       %11 = arith.addf %arg1, %arg2 : f32
       linalg.yield %11 : f32
     } -> !type_out
-  return %6: !type_out
+  util.return %6: !type_out
 }
 
 //       CHECK: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d1, d3)>
 //       CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
-// CHECK-LABEL: func.func @collapse8
+// CHECK-LABEL: util.func public @collapse8
 //  CHECK-SAME:     (%[[IN:.+]]: tensor<16x4x32x2xf32>)
 //       CHECK:   %[[COLLAPSE:.+]] = tensor.collapse_shape %[[IN]] {{\[}}[0, 1, 2], [3]{{\]}}
 //       CHECK:   %[[RES:.+]] = flow.dispatch.region
@@ -286,7 +286,7 @@
 
 !type_in = tensor<16x4xf32>
 !type_out = tensor<16x32x4xf32>
-func.func @dont_collapse() -> !type_out {
+util.func public @dont_collapse() -> !type_out {
   %cst = arith.constant 0.000000e+00 : f32
   %c0 = arith.constant 0 : index
   %input = tensor.empty() : !type_in
@@ -301,9 +301,9 @@
       %11 = arith.addf %arg1, %arg2 : f32
       linalg.yield %11 : f32
     } -> !type_out
-  return %6: !type_out
+  util.return %6: !type_out
 }
-// CHECK-LABEL: func.func @dont_collapse
+// CHECK-LABEL: util.func public @dont_collapse
 //       CHECK:   linalg.generic {indexing_maps = [#[[$MAP:.+]], #[[$MAP2:.+]]], iterator_types = ["parallel", "parallel", "parallel"]}
 
 // -----
@@ -312,7 +312,7 @@
 !type_out = tensor<2x4x16x64x32x128x256xf32>
 util.global private @"__transpose_10_input" {inlining_policy = #util.inline.never} = dense<1.0> : !type_in
 
-func.func @collapse9() -> !type_out {
+util.func public @collapse9() -> !type_out {
   %cst = arith.constant 0.000000e+00 : f32
   %c0 = arith.constant 0 : index
   %input_ptr = util.global.address @"__transpose_10_input" : !util.ptr<!type_in>
@@ -329,13 +329,13 @@
   ^bb0(%arg1: f32, %arg2: f32):
     linalg.yield %arg1 : f32
   } -> !type_out
-  return %result: !type_out
+  util.return %result: !type_out
 }
 
 
 //       CHECK: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>
 //       CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d4, d3, d5)>
-// CHECK-LABEL: func.func @collapse9
+// CHECK-LABEL: util.func public @collapse9
 //       CHECK:   %[[RES:.+]] = flow.dispatch.region
 //       CHECK:     linalg.generic {indexing_maps = [#[[$MAP]], #[[$MAP2]]], iterator_types = ["parallel", "reduction", "parallel", "parallel", "parallel", "parallel"]}
 
@@ -345,7 +345,7 @@
 !type_in = tensor<10x10x30xf32>
 !type_out = tensor<20x10x10x30x20xf32>
 
-func.func @collapse10(%input : !type_in) -> !type_out {
+util.func public @collapse10(%input : !type_in) -> !type_out {
   %cst = arith.constant 0.000000e+00 : f32
   %c0 = arith.constant 0 : index
   %output = tensor.empty() : !type_out
@@ -360,10 +360,10 @@
     linalg.yield %arg1 : f32
   }  -> !type_out
 
-  return %result: !type_out
+  util.return %result: !type_out
 }
 
-// CHECK-LABEL: func.func @collapse10
+// CHECK-LABEL: util.func public @collapse10
 //       CHECK:   %[[RES:.+]] = flow.dispatch.region
 //       CHECK:     linalg.generic {indexing_maps = [#[[$MAP]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel"]}
 
@@ -372,7 +372,7 @@
 !type_in =  tensor<10x20xf32>
 !type_out =  tensor<10x20xf32>
 
-func.func @collapse11(%input : !type_in) -> !type_out {
+util.func public @collapse11(%input : !type_in) -> !type_out {
   %cst = arith.constant 0.000000e+00 : f32
   %c0 = arith.constant 0 : index
   %output = tensor.empty() : !type_out
@@ -386,19 +386,19 @@
     linalg.yield %arg1 : f32
   }  -> !type_out
 
-  return %result: !type_out
+  util.return %result: !type_out
 }
 
 
 //       CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0)>
-// CHECK-LABEL: func.func @collapse11
+// CHECK-LABEL: util.func public @collapse11
 //       CHECK:   %[[RES:.+]] = flow.dispatch.region
 //       CHECK:     linalg.generic {indexing_maps = [#[[$MAP]], #[[$MAP]]], iterator_types = ["parallel"]}
 
 // -----
 
 !type = tensor<16x32xi32>
-func.func @dont_collapse_dueto_index(%height : index, %width : index) -> !type {
+util.func public @dont_collapse_dueto_index(%height : index, %width : index) -> !type {
   %init_source = tensor.empty() : !type
   %source = linalg.generic {
       indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
@@ -412,10 +412,10 @@
       %linearized_i32 = arith.index_cast %linearized : index to i32
       linalg.yield %linearized_i32 : i32
   } -> !type
-  return %source : !type
+  util.return %source : !type
 }
 
-// CHECK-LABEL: func.func @dont_collapse
+// CHECK-LABEL: util.func public @dont_collapse
 //       CHECK:   linalg.generic {indexing_maps = [#[[$MAP:.+]]], iterator_types = ["parallel", "parallel"]}
 
 // -----
@@ -423,7 +423,7 @@
 !type = tensor<2x4x8x16x32x64xf32>
 util.global private @"__transpose_10_input" {inlining_policy = #util.inline.never} = dense<1.0> : !type
 
-func.func @collapse12() -> (!type,!type,!type,!type) {
+util.func public @collapse12() -> (!type,!type,!type,!type) {
   %cst = arith.constant 0.000000e+00 : f32
   %c0 = arith.constant 0 : index
   %input_ptr = util.global.address @"__transpose_10_input" : !util.ptr<!type>
@@ -449,17 +449,17 @@
       %3 = arith.addf %2, %arg5 : f32
       linalg.yield %0,%1,%2,%3 : f32, f32, f32, f32
     } -> (!type,!type,!type,!type)
-  return %6, %7, %8, %9  : !type,!type,!type,!type
+  util.return %6, %7, %8, %9  : !type,!type,!type,!type
 }
 
 //       CHECK: #[[$MAP:.+]] = affine_map<(d0) -> (d0)>
-// CHECK-LABEL: func.func @collapse12
+// CHECK-LABEL: util.func public @collapse12
 //       CHECK:   %[[RES:.+]] = flow.dispatch.region
 //       CHECK:     linalg.generic {indexing_maps = [#[[$MAP]], #[[$MAP]], #[[$MAP]], #[[$MAP]], #[[$MAP]]], iterator_types = ["parallel"]}
 
 // -----
 
-func.func @multi_reduce_dim(%arg0: tensor<2x32x10x4096xf32>) -> tensor<2x32x1x1xf32> {
+util.func public @multi_reduce_dim(%arg0: tensor<2x32x10x4096xf32>) -> tensor<2x32x1x1xf32> {
   %cst = arith.constant -0.000000e+00 : f32
   %1 = tensor.empty() : tensor<2x32xf32>
   %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<2x32xf32>) -> tensor<2x32xf32>
@@ -469,7 +469,7 @@
     linalg.yield %6 : f32
   } -> tensor<2x32xf32>
   %4 = tensor.expand_shape %3 [[0], [1, 2, 3]] : tensor<2x32xf32> into tensor<2x32x1x1xf32>
-  return %4 : tensor<2x32x1x1xf32>
+  util.return %4 : tensor<2x32x1x1xf32>
 }
 
 // Check that we collapse dimensions.
@@ -491,7 +491,7 @@
 // Collapsing is not supported when an input is broadcasted; we can't collapse
 // the input from tensor<4xf32> to tensor<32xf32> for example.
 
-func.func @input_broadcast(%arg0: tensor<4x8xf32>, %arg1: tensor<4xf32>) -> tensor<f32> {
+util.func public @input_broadcast(%arg0: tensor<4x8xf32>, %arg1: tensor<4xf32>) -> tensor<f32> {
   %empty = tensor.empty() : tensor<f32>
   %reduce = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> ()>], iterator_types = ["reduction", "reduction"]} ins(%arg0, %arg1 : tensor<4x8xf32>, tensor<4xf32>) outs(%empty : tensor<f32>) {
   ^bb0(%arg2: f32, %arg3: f32, %out: f32):
@@ -499,7 +499,7 @@
     %add = arith.addf %out, %div : f32
     linalg.yield %add : f32
   } -> tensor<f32>
-  return %reduce : tensor<f32>
+  util.return %reduce : tensor<f32>
 }
 
 //     CHECK: @input_broadcast
@@ -515,7 +515,7 @@
 #map3 = affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d4)>
 #map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>
 module {
-  func.func @quantized_matmul(%arg0: tensor<4096x32x128xi8>, %arg1: tensor<1x1x32x128xf32>) -> tensor<1x1x4096xf32> {
+  util.func public @quantized_matmul(%arg0: tensor<4096x32x128xi8>, %arg1: tensor<1x1x32x128xf32>) -> tensor<1x1x4096xf32> {
     %cst = arith.constant dense_resource<__elided__> : tensor<4096x32xf32>
     %cst_0 = arith.constant dense_resource<__elided__> : tensor<4096x32xf32>
     %0 = flow.dispatch.region -> (tensor<1x1x4096xf32>) {
@@ -539,23 +539,23 @@
       } -> tensor<1x1x4096xf32>
       flow.return %5 : tensor<1x1x4096xf32>
     }
-    return %0 : tensor<1x1x4096xf32>
+    util.return %0 : tensor<1x1x4096xf32>
   }
 }
 
-// CHECK-LABEL:  func.func @quantized_matmul
+// CHECK-LABEL:  util.func public @quantized_matmul
 //       CHECK:    %[[DISPATCH:.+]] = flow.dispatch.region
 //       CHECK:      linalg.generic
 //  CHECK-SAME:          iterator_types = ["parallel", "parallel", "parallel"]
 //       CHECK:      linalg.generic
 //  CHECK-SAME:          iterator_types = ["parallel", "parallel", "parallel", "reduction", "reduction"]
 //       CHECK:      flow.return
-//       CHECK:    return %[[DISPATCH]]
+//       CHECK:    util.return %[[DISPATCH]]
 
 // -----
 
 module {
-  func.func @batchnorm_failure_repro(%arg0 : tensor<2x4xf32>, %arg1 : tensor<4xf32>) -> tensor<2x4xf32> {
+  util.func public @batchnorm_failure_repro(%arg0 : tensor<2x4xf32>, %arg1 : tensor<4xf32>) -> tensor<2x4xf32> {
     %0 = tensor.empty() : tensor<2x4xf32>
     %1 = linalg.generic {
         indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>],
@@ -565,20 +565,20 @@
         %2 = arith.addf %b0, %b1 : f32
         linalg.yield %2 : f32
     } -> tensor<2x4xf32>
-    return %1 : tensor<2x4xf32>
+    util.return %1 : tensor<2x4xf32>
   }
 }
-// CHECK-LABEL: func @batchnorm_failure_repro
+// CHECK-LABEL: util.func public @batchnorm_failure_repro
 //       CHECK:   %[[DISPATCH:.+]] = flow.dispatch.region
 //       CHECK:     %[[GENERIC:.+]] = linalg.generic
 //  CHECK-SAME:         iterator_types = ["parallel", "parallel"]
 //       CHECK:     flow.return %[[GENERIC]]
-//       CHECK:   return %[[DISPATCH]]
+//       CHECK:   util.return %[[DISPATCH]]
 
 // -----
 
 module {
-  func.func @catch_invalid_collapse(%arg0 : tensor<10x20x30xf32>) -> tensor<10x30x40xf32> {
+  util.func public @catch_invalid_collapse(%arg0 : tensor<10x20x30xf32>) -> tensor<10x30x40xf32> {
     %0 = tensor.empty() : tensor<10x30x40xf32>
     %1 = linalg.generic {
         indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>],
@@ -587,9 +587,9 @@
       ^bb0(%b0 : f32, %b1 : f32):
         linalg.yield %b0 : f32
     } -> tensor<10x30x40xf32>
-    return %1 : tensor<10x30x40xf32>
+    util.return %1 : tensor<10x30x40xf32>
   }
 }
-// CHECK-LABEL: func @catch_invalid_collapse
+// CHECK-LABEL: util.func public @catch_invalid_collapse
 //       CHECK:   linalg.generic
 //  CHECK-SAME:       iterator_types = ["parallel", "parallel", "parallel", "parallel"]

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/collapse_reduction.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/collapse_reduction.mlir
index 320cb4d..3a87df7 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/collapse_reduction.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/collapse_reduction.mlir

@@ -1,6 +1,6 @@
 // RUN: iree-opt --split-input-file -iree-flow-collapse-dims %s | FileCheck %s
 
-func.func @multi_reduce_dim(%arg0: tensor<2x32x10x4096xf32>) -> tensor<2x32x1x1xf32> {
+util.func public @multi_reduce_dim(%arg0: tensor<2x32x10x4096xf32>) -> tensor<2x32x1x1xf32> {
   %cst = arith.constant -0.000000e+00 : f32
   %1 = tensor.empty() : tensor<2x32xf32>
   %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<2x32xf32>) -> tensor<2x32xf32>
@@ -10,7 +10,7 @@
     linalg.yield %6 : f32
   } -> tensor<2x32xf32>
   %4 = tensor.expand_shape %3 [[0], [1, 2, 3]] : tensor<2x32xf32> into tensor<2x32x1x1xf32>
-  return %4 : tensor<2x32x1x1xf32>
+  util.return %4 : tensor<2x32x1x1xf32>
 }
 
 // Check that we collapse dimensions.
@@ -22,7 +22,7 @@
 // Collapsing is not supported when an input is broadcasted; we can't collapse
 // the input from tensor<4xf32> to tensor<32xf32> for example.
 
-func.func @input_broadcast(%arg0: tensor<4x8xf32>, %arg1: tensor<4xf32>) -> tensor<f32> {
+util.func public @input_broadcast(%arg0: tensor<4x8xf32>, %arg1: tensor<4xf32>) -> tensor<f32> {
   %empty = tensor.empty() : tensor<f32>
   %reduce = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> ()>], iterator_types = ["reduction", "reduction"]} ins(%arg0, %arg1 : tensor<4x8xf32>, tensor<4xf32>) outs(%empty : tensor<f32>) {
   ^bb0(%arg2: f32, %arg3: f32, %out: f32):
@@ -30,7 +30,7 @@
     %add = arith.addf %out, %div : f32
     linalg.yield %add : f32
   } -> tensor<f32>
-  return %reduce : tensor<f32>
+  util.return %reduce : tensor<f32>
 }
 
 // CHECK: @input_broadcast
@@ -40,7 +40,7 @@
 
 // Collapsing should not happen to ops in flow.dispatch.region or flow.dispatch.workgroups
 
-func.func @multi_reduce_dim_dispatch(%arg0: tensor<2x32x10x4096xf32>) -> tensor<2x32x1x1xf32> {
+util.func public @multi_reduce_dim_dispatch(%arg0: tensor<2x32x10x4096xf32>) -> tensor<2x32x1x1xf32> {
   %cst = arith.constant -0.000000e+00 : f32
   %1 = tensor.empty() : tensor<2x32xf32>
   %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<2x32xf32>) -> tensor<2x32xf32>
@@ -53,7 +53,7 @@
     flow.return %6 : tensor<2x32xf32>
   }
   %4 = tensor.expand_shape %3 [[0], [1, 2, 3]] : tensor<2x32xf32> into tensor<2x32x1x1xf32>
-  return %4 : tensor<2x32x1x1xf32>
+  util.return %4 : tensor<2x32x1x1xf32>
 }
 
 // CHECK: @multi_reduce_dim_dispatch

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/convert_region_to_workgroups.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/convert_region_to_workgroups.mlir
index 19139b2..044dec3 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/convert_region_to_workgroups.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/convert_region_to_workgroups.mlir

@@ -1,8 +1,8 @@
 // RUN: iree-opt %s --iree-flow-convert-region-to-workgroups -canonicalize -cse -split-input-file | FileCheck %s
 
-// CHECK-LABEL: func @foo(
+// CHECK-LABEL: util.func public @foo(
 //       CHECK:   %[[argA:.*]]: tensor<?x?xf32>, %[[argB:.*]]: tensor<5x10xf32>, %[[argC:.*]]: tensor<10x11xf32>
-func.func @foo(%argA: tensor<?x?xf32>, %argB: tensor<5x10xf32>, %argC: tensor<10x11xf32>) -> (tensor<?x?xf32>, tensor<5x11xf32>) {
+util.func public @foo(%argA: tensor<?x?xf32>, %argB: tensor<5x10xf32>, %argC: tensor<10x11xf32>) -> (tensor<?x?xf32>, tensor<5x11xf32>) {
   //  CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
   //  CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
   //  CHECK-DAG: %[[dim_argA_0:.*]] = tensor.dim %[[argA]], %[[c0]]
@@ -40,6 +40,6 @@
     flow.return %2 : tensor<5x11xf32>
   }
 
-  //      CHECK: return %[[r0]], %[[r1]]
-  return %r0, %r1 : tensor<?x?xf32>, tensor<5x11xf32>
+  //      CHECK: util.return %[[r0]], %[[r1]]
+  util.return %r0, %r1 : tensor<?x?xf32>, tensor<5x11xf32>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/deduplicate_executables.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/deduplicate_executables.mlir
index c28ffef..8ce0a36 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/deduplicate_executables.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/deduplicate_executables.mlir

@@ -1,7 +1,7 @@
 // RUN: iree-opt --split-input-file --iree-flow-deduplicate-executables %s | FileCheck %s
 
 // CHECK-LABEL: flow.executable public @single_executable_ex_0
-flow.executable @single_executable_ex_0 {
+flow.executable public @single_executable_ex_0 {
   flow.executable.export @single_executable_entry_0
   builtin.module {
     func.func @single_executable_entry_0(%arg0: tensor<4xf32>) -> tensor<4xf32> {
@@ -10,18 +10,18 @@
     }
   }
 }
-// CHECK-LABEL: func.func @single_executable
-func.func @single_executable(%arg0: tensor<4xf32>) -> tensor<4xf32> {
+// CHECK-LABEL: util.func public @single_executable
+util.func public @single_executable(%arg0: tensor<4xf32>) -> tensor<4xf32> {
   %c4 = arith.constant 4 : index
   // CHECK: %0 = flow.dispatch @single_executable_ex_0::@single_executable_entry_0[%c4](%arg0) : (tensor<4xf32>) -> tensor<4xf32>
   %0 = flow.dispatch @single_executable_ex_0::@single_executable_entry_0[%c4](%arg0) : (tensor<4xf32>) -> tensor<4xf32>
-  return %0 : tensor<4xf32>
+  util.return %0 : tensor<4xf32>
 }
 
 // -----
 
 // CHECK-LABEL: flow.executable public @duplicate_executables_ex_0
-flow.executable @duplicate_executables_ex_0 {
+flow.executable public @duplicate_executables_ex_0 {
   flow.executable.export @duplicate_executables_entry_0
   builtin.module {
     func.func @duplicate_executables_entry_0(%arg0: tensor<4xf32>) -> tensor<4xf32> {
@@ -31,7 +31,7 @@
   }
 }
 // CHECK-NOT: flow.executable public @duplicate_executables_ex_1
-flow.executable @duplicate_executables_ex_1 {
+flow.executable public @duplicate_executables_ex_1 {
   flow.executable.export @duplicate_executables_entry_1
   builtin.module {
     func.func @duplicate_executables_entry_1(%arg0: tensor<4xf32>) -> tensor<4xf32> {
@@ -41,7 +41,7 @@
   }
 }
 // CHECK-LABEL: flow.executable public @duplicate_executables_ex_2
-flow.executable @duplicate_executables_ex_2 {
+flow.executable public @duplicate_executables_ex_2 {
   flow.executable.export @duplicate_executables_entry_2
   builtin.module {
     func.func @duplicate_executables_entry_2(%arg0: tensor<4xf32>) -> tensor<4xf32> {
@@ -50,8 +50,8 @@
     }
   }
 }
-// CHECK-LABEL: func.func @duplicate_executables
-func.func @duplicate_executables(%arg0: tensor<4xf32>) {
+// CHECK-LABEL: util.func public @duplicate_executables
+util.func public @duplicate_executables(%arg0: tensor<4xf32>) {
   %c4 = arith.constant 4 : index
   // CHECK: = flow.dispatch @duplicate_executables_ex_0::@duplicate_executables_entry_0[%c4](%arg0) : (tensor<4xf32>) -> tensor<4xf32>
   %0 = flow.dispatch @duplicate_executables_ex_0::@duplicate_executables_entry_0[%c4](%arg0) : (tensor<4xf32>) -> tensor<4xf32>
@@ -61,7 +61,7 @@
   %2 = flow.dispatch @duplicate_executables_ex_2::@duplicate_executables_entry_2[%c4](%arg0) : (tensor<4xf32>) -> tensor<4xf32>
   // CHECK: = flow.dispatch {@duplicate_executables_ex_0::@duplicate_executables_entry_0, @duplicate_executables_ex_0::@duplicate_executables_entry_0}
   %3 = flow.dispatch {@duplicate_executables_ex_0::@duplicate_executables_entry_0, @duplicate_executables_ex_1::@duplicate_executables_entry_1}[%c4](%arg0) : (tensor<4xf32>) -> tensor<4xf32>
-  return
+  util.return
 }
 
 // Ensure that symbol renaming is done within initializers.
@@ -97,14 +97,14 @@
     }
   }
 }
-// CHECK-LABEL: func.func @same_ops_diff_operands
-func.func @same_ops_diff_operands(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2xi32> {
+// CHECK-LABEL: util.func public @same_ops_diff_operands
+util.func public @same_ops_diff_operands(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2xi32> {
   %c4 = arith.constant 4 : index
   // CHECK: %0 = flow.dispatch @same_ops_diff_operands_ex_0::@entry_0[%c4](%arg0, %arg1) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32>
   %0 = flow.dispatch @same_ops_diff_operands_ex_0::@entry_0[%c4](%arg0, %arg1) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32>
   // CHECK: %1 = flow.dispatch @same_ops_diff_operands_ex_1::@entry_1[%c4](%arg0, %arg1) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32>
   %1 = flow.dispatch @same_ops_diff_operands_ex_1::@entry_1[%c4](%arg0, %arg1) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32>
-  return %0 : tensor<2xi32>
+  util.return %0 : tensor<2xi32>
 }
 
 // -----
@@ -139,8 +139,8 @@
     }
   }
 }
-// CHECK-LABEL: func.func @multiple_entry_points
-func.func @multiple_entry_points(%arg0: tensor<4xf32>) -> tensor<4xf32> {
+// CHECK-LABEL: util.func public @multiple_entry_points
+util.func public @multiple_entry_points(%arg0: tensor<4xf32>) -> tensor<4xf32> {
   // CHECK: %[[C4:.*]] = arith.constant 4
   %c4 = arith.constant 4 : index
   // CHECK:      {{.*}} = flow.dispatch @multiple_entry_points_ex_0::@multiple_entry_points_0_entry_0[%[[C4]]](%arg0) : (tensor<4xf32>) -> tensor<4xf32>
@@ -151,7 +151,7 @@
   %2 = flow.dispatch @multiple_entry_points_ex_1::@multiple_entry_points_1_entry_0[%c4](%arg0) : (tensor<4xf32>) -> tensor<4xf32>
   // CHECK-NEXT: {{.*}} = flow.dispatch @multiple_entry_points_ex_0::@multiple_entry_points_0_entry_1[%[[C4]]](%arg0) : (tensor<4xf32>) -> tensor<4xf32>
   %3 = flow.dispatch @multiple_entry_points_ex_1::@multiple_entry_points_1_entry_1[%c4](%arg0) : (tensor<4xf32>) -> tensor<4xf32>
-  return %0 : tensor<4xf32>
+  util.return %0 : tensor<4xf32>
 }
 
 // -----
@@ -176,14 +176,14 @@
     }
   }
 }
-// CHECK-LABEL: func.func @different_types
-func.func @different_types(%arg0: tensor<4xf32>) -> tensor<4xi1> {
+// CHECK-LABEL: util.func public @different_types
+util.func public @different_types(%arg0: tensor<4xf32>) -> tensor<4xi1> {
   %c4 = arith.constant 4 : index
   // CHECK: %0 = flow.dispatch @different_types_float_ex::@different_types_float_entry[%c4](%arg0) : (tensor<4xf32>) -> tensor<4xi1>
   %0 = flow.dispatch @different_types_float_ex::@different_types_float_entry[%c4](%arg0) : (tensor<4xf32>) -> tensor<4xi1>
   // CHECK: %1 = flow.dispatch @different_types_int_ex::@different_types_int_entry[%c4](%arg0) : (tensor<4xf32>) -> tensor<4xi1>
   %1 = flow.dispatch @different_types_int_ex::@different_types_int_entry[%c4](%arg0) : (tensor<4xf32>) -> tensor<4xi1>
-  return %0 : tensor<4xi1>
+  util.return %0 : tensor<4xi1>
 }
 
 // -----
@@ -234,8 +234,8 @@
     }
   }
 }
-// CHECK-LABEL: func.func @nested_ops
-func.func @nested_ops(%arg0: tensor<5x6xf32>, %arg1: tensor<5x6xf32>) -> tensor<5x6xf32> {
+// CHECK-LABEL: util.func public @nested_ops
+util.func public @nested_ops(%arg0: tensor<5x6xf32>, %arg1: tensor<5x6xf32>) -> tensor<5x6xf32> {
   %c4 = arith.constant 4 : index
   // CHECK: %0 = flow.dispatch @nested_ops_ex_0::@nested_ops_entry_0[%c4](%arg0, %arg1) : (tensor<5x6xf32>, tensor<5x6xf32>) -> tensor<5x6xf32>
   %0 = flow.dispatch @nested_ops_ex_0::@nested_ops_entry_0[%c4](%arg0, %arg1) : (tensor<5x6xf32>, tensor<5x6xf32>) -> tensor<5x6xf32>
@@ -243,7 +243,7 @@
   %1 = flow.dispatch @nested_ops_ex_0::@nested_ops_entry_0[%c4](%arg0, %arg1) : (tensor<5x6xf32>, tensor<5x6xf32>) -> tensor<5x6xf32>
   // CHECK: %2 = flow.dispatch @nested_ops_ex_2::@nested_ops_entry_2[%c4](%arg0, %arg1) : (tensor<5x6xf32>, tensor<5x6xf32>) -> tensor<5x6xf32>
   %2 = flow.dispatch @nested_ops_ex_2::@nested_ops_entry_2[%c4](%arg0, %arg1) : (tensor<5x6xf32>, tensor<5x6xf32>) -> tensor<5x6xf32>
-  return %0 : tensor<5x6xf32>
+  util.return %0 : tensor<5x6xf32>
 }
 
 // -----
@@ -417,13 +417,13 @@
   }
 }
 
-// CHECK-LABEL: func.func @dispatch_variants
-func.func @dispatch_variants(%arg0: tensor<4xf32>) -> tensor<4xf32> {
+// CHECK-LABEL: util.func public @dispatch_variants
+util.func public @dispatch_variants(%arg0: tensor<4xf32>) -> tensor<4xf32> {
   // CHECK: %[[C4:.*]] = arith.constant 4
   %c4 = arith.constant 4 : index
   // CHECK:      {{.*}} = flow.dispatch @ex0::@variant::@dispatch[%[[C4]]](%arg0) : (tensor<4xf32>) -> tensor<4xf32>
   %0 = flow.dispatch @ex0::@variant::@dispatch[%c4](%arg0) : (tensor<4xf32>) -> tensor<4xf32>
   // CHECK-NEXT: {{.*}} = flow.dispatch @ex0::@variant::@dispatch[%[[C4]]](%arg0) : (tensor<4xf32>) -> tensor<4xf32>
   %1 = flow.dispatch @ex1::@variant::@dispatch[%c4](%arg0) : (tensor<4xf32>) -> tensor<4xf32>
-  return %1 : tensor<4xf32>
+  util.return %1 : tensor<4xf32>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir
index e77d1be..0b53662 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir

@@ -1,12 +1,12 @@
-// RUN: iree-opt --split-input-file --verify-diagnostics --pass-pipeline="builtin.module(func.func(iree-flow-form-dispatch-regions{fuse-multi-use=true}, iree-flow-clone-producers-into-dispatch-regions, iree-flow-form-dispatch-workgroups), cse, canonicalize, cse)" %s | FileCheck %s
+// RUN: iree-opt --split-input-file --verify-diagnostics --pass-pipeline="builtin.module(util.func(iree-flow-form-dispatch-regions{fuse-multi-use=true}, iree-flow-clone-producers-into-dispatch-regions, iree-flow-form-dispatch-workgroups), cse, canonicalize, cse)" %s | FileCheck %s
 
-func.func @tile_matmul_alone(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>,
+util.func public @tile_matmul_alone(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>,
              %arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> {
   %1 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
     outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %1 : tensor<?x?xf32>
+  util.return %1 : tensor<?x?xf32>
 }
-//      CHECK: func.func @tile_matmul_alone
+//      CHECK: util.func public @tile_matmul_alone
 // CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
 // CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
 // CHECK-SAME:   %[[ARG2:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
@@ -48,11 +48,11 @@
 // CHECK-SAME:         offsets = [0, 0], sizes = [%[[ARG10_W]], %[[ARG11_W]]], strides = [1, 1]
 //      CHECK:     count(%[[W0:.+]]: index, %[[W1:.+]]: index, %[[W2:.+]]: index, %[[W3:.+]]: index, %[[W4:.+]]: index, %[[W5:.+]]: index)
 //      CHECK:       %[[WX:.+]], %[[WY:.+]], %[[WZ:.+]] = flow.dispatch.workgroup_count_from_slice %[[W0]], %[[W1]], %[[W2]], %[[W3]], %[[W4]], %[[W5]]
-//      CHECK:       return %[[WX]], %[[WY]], %[[WZ]]
+//      CHECK:       flow.return %[[WX]], %[[WY]], %[[WZ]]
 
 // -----
 
-func.func @generic_op_alone(%A: tensor<?x?xf32>, %B: tensor<?xf32>) -> tensor<?x?xf32> {
+util.func public @generic_op_alone(%A: tensor<?x?xf32>, %B: tensor<?xf32>) -> tensor<?x?xf32> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %d0 = tensor.dim %A, %c0 : tensor<?x?xf32>
@@ -69,9 +69,9 @@
         %2 = arith.addf %arg0, %arg1 : f32
         linalg.yield %2 : f32
     } -> tensor<?x?xf32>
-  return %1 : tensor<?x?xf32>
+  util.return %1 : tensor<?x?xf32>
 }
-//      CHECK: func.func @generic_op_alone(
+//      CHECK: util.func public @generic_op_alone(
 // CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
 // CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: tensor<?xf32>
 //  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
@@ -97,7 +97,7 @@
 
 // -----
 
-func.func @fuse_matmul_with_fill(%A : tensor<?x?xf32>, %B : tensor<?x?xf32>) -> tensor<?x?xf32> {
+util.func public @fuse_matmul_with_fill(%A : tensor<?x?xf32>, %B : tensor<?x?xf32>) -> tensor<?x?xf32> {
   %zero = arith.constant 0.0 : f32
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
@@ -107,9 +107,9 @@
   %1 = linalg.fill ins(%zero : f32) outs(%0 : tensor<?x?xf32>) -> tensor<?x?xf32>
   %2 = linalg.matmul ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
     outs(%1 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %2 : tensor<?x?xf32>
+  util.return %2 : tensor<?x?xf32>
 }
-//       CHECK:   func.func @fuse_matmul_with_fill
+//       CHECK:   util.func public @fuse_matmul_with_fill
 //  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
 //  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
 //   CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
@@ -146,7 +146,7 @@
 
 // -----
 
-func.func @keep_separate_dispatches_for_producer(%A : tensor<?x?xf32>, %B : tensor<?x?xf32>) -> tensor<?x?xf32> {
+util.func public @keep_separate_dispatches_for_producer(%A : tensor<?x?xf32>, %B : tensor<?x?xf32>) -> tensor<?x?xf32> {
   %zero = arith.constant 0.0 : f32
   %one = arith.constant 1.0 : f32
   %c0 = arith.constant 0 : index
@@ -168,9 +168,9 @@
     } -> tensor<?x?xf32>
   %4 = linalg.matmul ins(%3, %B : tensor<?x?xf32>, tensor<?x?xf32>)
     outs(%1 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %4 : tensor<?x?xf32>
+  util.return %4 : tensor<?x?xf32>
 }
-//      CHECK: func.func @keep_separate_dispatches_for_producer
+//      CHECK: util.func public @keep_separate_dispatches_for_producer
 //  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
 //  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
 //   CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
@@ -202,7 +202,7 @@
 
 // -----
 
-func.func @always_fuse_cast
+util.func public @always_fuse_cast
   (%lhs : tensor<?x?xf32>, %rhs1 : tensor<4x?xf32>, %rhs2 : tensor<4x?xf32>)
   -> (tensor<?x?xf32>, tensor<?x?xf32>)
 {
@@ -223,10 +223,10 @@
   %2= linalg.matmul
     ins(%0, %rhs2 : tensor<?x4xf32>, tensor<4x?xf32>)
     outs(%fill2 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %1, %2 : tensor<?x?xf32>, tensor<?x?xf32>
+  util.return %1, %2 : tensor<?x?xf32>, tensor<?x?xf32>
 }
 
-//      CHECK: func.func @always_fuse_cast(
+//      CHECK: util.func public @always_fuse_cast(
 // CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
 // CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: tensor<4x?xf32>
 // CHECK-SAME:   %[[ARG2:[a-zA-Z0-9_]+]]: tensor<4x?xf32>
@@ -244,11 +244,11 @@
 // CHECK-SAME:     (%[[ARG0]], %[[ARG2]], %[[M]], %[[K]], %[[N2]])
 //      CHECK:     tensor.cast
 //      CHECK:     flow.return
-//      CHECK:   return %[[RESULT1]], %[[RESULT2]]
+//      CHECK:   util.return %[[RESULT1]], %[[RESULT2]]
 
 // -----
 
-func.func @dont_fuse_tensor_update_with_fill(
+util.func public @dont_fuse_tensor_update_with_fill(
     %arg0: tensor<?x?xf32>, %arg1: tensor<f32>,
     %arg2: index, %arg3: index, %arg4: index, %arg5: index)
 -> tensor<?x?xf32> {
@@ -262,26 +262,26 @@
   %5 = tensor.empty(%3, %4) : tensor<?x?xf32>
   %6 = linalg.fill ins(%0 : f32) outs(%5 : tensor<?x?xf32>) -> tensor<?x?xf32>
   %7 = flow.tensor.update %arg0, %6[%arg2, %arg3] : tensor<?x?xf32>{%1, %2} -> %6 as tensor<?x?xf32>{%3, %4}
-  return %7 : tensor<?x?xf32>
+  util.return %7 : tensor<?x?xf32>
 }
 
-// CHECK: func.func @dont_fuse_tensor_update_with_fill
+// CHECK: util.func public @dont_fuse_tensor_update_with_fill
 // CHECK:   %[[SPLAT:.+]] = flow.tensor.splat
 // CHECK:   flow.tensor.update %{{.+}}, %[[SPLAT]]
 
 // -----
 
-func.func @pass_constant_through() -> tensor<2x2x3xi32> {
+util.func public @pass_constant_through() -> tensor<2x2x3xi32> {
   %cst = arith.constant dense<[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]> : tensor<2x2x3xi32>
-  return %cst : tensor<2x2x3xi32>
+  util.return %cst : tensor<2x2x3xi32>
 }
-// CHECK-LABEL: func.func @pass_constant_through()
+// CHECK-LABEL: util.func public @pass_constant_through()
 //       CHECK:   %[[CST:.+]] = arith.constant dense<{{.+}}> : tensor<2x2x3xi32>
-//       CHECK:   return %[[CST]]
+//       CHECK:   util.return %[[CST]]
 
 // -----
 
-func.func @fuse_matmul_with_generic_op(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>)
+util.func public @fuse_matmul_with_generic_op(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>)
   -> tensor<?x?xf32> {
   %f12 = arith.constant 12.0 : f32
 
@@ -295,9 +295,9 @@
 
   %D = linalg.matmul ins(%A, %B: tensor<?x?xf32>, tensor<?x?xf32>)
                     outs(%CC: tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %D: tensor<?x?xf32>
+  util.return %D: tensor<?x?xf32>
 }
-// CHECK-LABEL: func.func @fuse_matmul_with_generic_op
+// CHECK-LABEL: util.func public @fuse_matmul_with_generic_op
 // linalg.generic is fused inside the dispatch region and becomes dead.
 //   CHECK-NOT: generic
 //     CHECK: flow.dispatch.workgroups
@@ -306,7 +306,7 @@
 
 // -----
 
-func.func @keep_original_producer_uses(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>)
+util.func public @keep_original_producer_uses(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>)
   -> (tensor<?x?xf32>, tensor<?x?xf32>) {
   %f12 = arith.constant 12.0 : f32
 
@@ -323,9 +323,9 @@
   %D = linalg.matmul ins(%A, %B: tensor<?x?xf32>, tensor<?x?xf32>)
                     outs(%CC: tensor<?x?xf32>) -> tensor<?x?xf32>
 
-  return %D, %CC: tensor<?x?xf32>, tensor<?x?xf32>
+  util.return %D, %CC: tensor<?x?xf32>, tensor<?x?xf32>
 }
-//      CHECK: func.func @keep_original_producer_uses
+//      CHECK: util.func public @keep_original_producer_uses
 // CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
 // CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
 // CHECK-SAME:   %[[ARG2:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
@@ -347,11 +347,11 @@
 // CHECK-SAME:     outs(%[[STOREVAL]] : tensor<?x?xf32>)
 //  CHECK-DAG:   flow.dispatch.tensor.store %[[STOREVAL]], %[[RESULT_CAPTURE]]
 //  CHECK-DAG:   flow.dispatch.tensor.store %[[GEMM]], %[[ARG2_CAPTURE]]
-//      CHECK: return %[[origCC]]#0, %[[origCC]]#1
+//      CHECK: util.return %[[origCC]]#0, %[[origCC]]#1
 
 // -----
 
-func.func @conv2d(%input: tensor<1x225x225x16xf32>, %filter: tensor<3x3x16x32xf32>) -> tensor<1x112x112x32xf32> {
+util.func public @conv2d(%input: tensor<1x225x225x16xf32>, %filter: tensor<3x3x16x32xf32>) -> tensor<1x112x112x32xf32> {
   %0 = tensor.empty() : tensor<1x112x112x32xf32>
   %cst = arith.constant 0.000000e+00 : f32
   %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
@@ -360,40 +360,40 @@
          ins(%input, %filter : tensor<1x225x225x16xf32>, tensor<3x3x16x32xf32>)
          outs(%1 : tensor<1x112x112x32xf32>)
          -> tensor<1x112x112x32xf32>
-  return %2 : tensor<1x112x112x32xf32>
+  util.return %2 : tensor<1x112x112x32xf32>
 }
 
-// CHECK-LABEL: func.func @conv2d
+// CHECK-LABEL: util.func public @conv2d
 //       CHECK:   %[[RESULT:.+]] = flow.dispatch.workgroups
 //       CHECK:     linalg.conv_2d_nhwc_hwcf
 //       CHECK:     flow.return
-//       CHECK:   return %[[RESULT]]
+//       CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @depthwise_conv2d(%input: tensor<1x113x113x96xf32>, %filter: tensor<3x3x96xf32>) -> tensor<1x56x56x96xf32> {
+util.func public @depthwise_conv2d(%input: tensor<1x113x113x96xf32>, %filter: tensor<3x3x96xf32>) -> tensor<1x56x56x96xf32> {
   %cst = arith.constant 0.000000e+00 : f32
   %1 = tensor.empty() : tensor<1x56x56x96xf32>
   %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<1x56x56x96xf32>) -> tensor<1x56x56x96xf32>
   %4 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%input, %filter : tensor<1x113x113x96xf32>, tensor<3x3x96xf32>) outs(%2 : tensor<1x56x56x96xf32>) -> tensor<1x56x56x96xf32>
-  return %4 : tensor<1x56x56x96xf32>
+  util.return %4 : tensor<1x56x56x96xf32>
 }
 
-// CHECK-LABEL: func.func @depthwise_conv2d
+// CHECK-LABEL: util.func public @depthwise_conv2d
 //       CHECK:   %[[RESULT:.+]] = flow.dispatch.workgroups
 //       CHECK:     linalg.depthwise_conv_2d_nhwc_hwc
 //       CHECK:     flow.return
-//       CHECK:   return %[[RESULT]]
+//       CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @subtensor_insert(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>,
+util.func public @subtensor_insert(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>,
     %arg2 : index, %arg3 : index, %arg4 : index, %arg5 : index) -> tensor<?x?xf32> {
   %0 = tensor.insert_slice %arg0 into
       %arg1[%arg2, %arg3] [%arg4, %arg5] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
-  return %0 : tensor<?x?xf32>
+  util.return %0 : tensor<?x?xf32>
 }
-//      CHECK: func.func @subtensor_insert
+//      CHECK: util.func public @subtensor_insert
 // CHECK-SAME:     %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
 // CHECK-SAME:     %[[ARG1:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
 // CHECK-SAME:     %[[ARG2:[a-zA-Z0-9_]+]]: index
@@ -438,11 +438,11 @@
 // CHECK-SAME:         offsets = [%[[ARG2_W]], %[[ARG3_W]]]
 // CHECK-SAME:         sizes = [%[[ARG4_W]], %[[ARG5_W]]]
 // CHECK-SAME:         !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%[[ARG1_D0_W]], %[[ARG1_D1_W]]}
-//      CHECK:   return %[[RESULT]]
+//      CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @fuse_non_tiled_reduction_fill(%input1: tensor<1000xf32>, %input2: tensor<1000xf32>, %offset: tensor<f32>) -> tensor<f32> {
+util.func public @fuse_non_tiled_reduction_fill(%input1: tensor<1000xf32>, %input2: tensor<1000xf32>, %offset: tensor<f32>) -> tensor<f32> {
   %zero = arith.constant 0.0 : f32
   %init = tensor.empty() : tensor<f32>
   %fill = linalg.fill ins(%zero : f32) outs(%init : tensor<f32>) -> tensor<f32>
@@ -458,10 +458,10 @@
     %558 = arith.addf %557, %arg4 : f32
     linalg.yield %558 : f32
   } -> tensor<f32>
-  return %reduce : tensor<f32>
+  util.return %reduce : tensor<f32>
 }
 
-// CHECK-LABEL: func.func @fuse_non_tiled_reduction_fill
+// CHECK-LABEL: util.func public @fuse_non_tiled_reduction_fill
 
 //      CHECK: flow.dispatch.workgroups({{.+}}) : (tensor<1000xf32>, tensor<1000xf32>, tensor<f32>) -> tensor<f32> =
 // CHECK-NEXT:     (%[[INPUT1:[a-z0-9]+]]: !flow.dispatch.tensor<readonly:tensor<1000xf32>>,
@@ -481,7 +481,7 @@
 
 #map0 = affine_map<(d0, d1) -> ()>
 #map1 = affine_map<(d0, d1) -> (d0, d1)>
-func.func @inline_dag_1(
+util.func public @inline_dag_1(
     %arg0: tensor<?x?xf32>, %arg1 : tensor<?x?xf32>, %arg2 : tensor<i32>,
     %arg3 : index) -> tensor<1x?xf32> {
   %0 = tensor.cast %arg0 : tensor<?x?xf32> to tensor<1x?xf32>
@@ -507,9 +507,9 @@
         %14 = arith.addf %12, %13 : f32
         linalg.yield %14 : f32
       } -> tensor<1x?xf32>
-  return %9 : tensor<1x?xf32>
+  util.return %9 : tensor<1x?xf32>
 }
-// CHECK-LABEL: func.func @inline_dag_1
+// CHECK-LABEL: util.func public @inline_dag_1
 //   CHECK-NOT:   linalg.
 //   CHECK-NOT:   tensor.extract_slice
 //       CHECK:   flow.dispatch.workgroups
@@ -539,7 +539,7 @@
 
 #map0 = affine_map<(d0, d1) -> ()>
 #map1 = affine_map<(d0, d1) -> (d0, d1)>
-func.func @inline_dag_2(
+util.func public @inline_dag_2(
     %arg0: tensor<?x?xf32>, %arg1 : tensor<1x?xf32>, %arg2 : tensor<i32>,
     %arg3 : index) -> tensor<1x?xf32> {
   %0 = tensor.cast %arg0 : tensor<?x?xf32> to tensor<1x?xf32>
@@ -567,9 +567,9 @@
         %14 = arith.addf %12, %13 : f32
         linalg.yield %14 : f32
       } -> tensor<1x?xf32>
-  return %9 : tensor<1x?xf32>
+  util.return %9 : tensor<1x?xf32>
 }
-// CHECK-LABEL: func.func @inline_dag_2
+// CHECK-LABEL: util.func public @inline_dag_2
 //  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
 //  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: tensor<1x?xf32>
 //       CHECK:   flow.dispatch.workgroups
@@ -595,7 +595,7 @@
 
 // -----
 
-func.func @inline_dag_3(%240 : tensor<9xi32>, %244 : tensor<18xi32>, %247 : tensor<i32>) -> tensor<9xi1> {
+util.func public @inline_dag_3(%240 : tensor<9xi32>, %244 : tensor<18xi32>, %247 : tensor<i32>) -> tensor<9xi1> {
   %c9 = arith.constant 9 : index
   %c5_i32 = arith.constant 5 : i32
   %c0_i32 = arith.constant 0 : i32
@@ -617,9 +617,9 @@
           %849 = arith.cmpi eq, %arg20, %c5_i32 : i32
           linalg.yield %849 : i1
       } -> tensor<9xi1>
-  return %256 : tensor<9xi1>
+  util.return %256 : tensor<9xi1>
 }
-//       CHECK: func.func @inline_dag_3
+//       CHECK: util.func public @inline_dag_3
 //  CHECK-SAME:   %[[ARG0:.+]]: tensor<9xi32>
 //  CHECK-SAME:   %[[ARG1:.+]]: tensor<18xi32>
 //  CHECK-SAME:   %[[ARG2:.+]]: tensor<i32>
@@ -648,7 +648,7 @@
 // -----
 
 #map = affine_map<() -> ()>
-func.func @inline_dag_4(%arg0: tensor<4xi32>, %arg1: tensor<i32>) -> tensor<i16> {
+util.func public @inline_dag_4(%arg0: tensor<4xi32>, %arg1: tensor<i32>) -> tensor<i16> {
   %c3_i32 = arith.constant 3 : i32
   %c0_i32 = arith.constant 0 : i32
   %0 = tensor.extract %arg1[] : tensor<i32>
@@ -666,9 +666,9 @@
     %9 = arith.trunci %arg2 : i32 to i16
     linalg.yield %9 : i16
   } -> tensor<i16>
-  return %8 : tensor<i16>
+  util.return %8 : tensor<i16>
 }
-// CHECK-LABEL: func.func @inline_dag_4
+// CHECK-LABEL: util.func public @inline_dag_4
 //  CHECK-SAME:   %[[ARG0:.+]]: tensor<4xi32>
 //  CHECK-SAME:   %[[ARG1:.+]]: tensor<i32>
 //       CHECK:   flow.dispatch.workgroups
@@ -698,7 +698,7 @@
 
 // -----
 
-func.func @multi_result(%arg0: tensor<?x?xi32>, %arg1: tensor<?x?xi32>) -> (tensor<?xi32>, tensor<?xi32>) {
+util.func public @multi_result(%arg0: tensor<?x?xi32>, %arg1: tensor<?x?xi32>) -> (tensor<?xi32>, tensor<?xi32>) {
   %cmin = arith.constant -2147483648 : i32
   %c0_i32 = arith.constant 0 : i32
   %c0 = arith.constant 0 : index
@@ -724,22 +724,22 @@
     %11 = arith.select %7, %9, %10 : i32
     linalg.yield %6, %11 : i32, i32
   } -> (tensor<?xi32>, tensor<?xi32>)
-  return %4#0, %4#1 : tensor<?xi32>, tensor<?xi32>
+  util.return %4#0, %4#1 : tensor<?xi32>, tensor<?xi32>
 }
-// CHECK-LABEL: func.func @multi_result
+// CHECK-LABEL: util.func public @multi_result
 //       CHECK:   %[[RESULT_OUT:.+]]:2 = flow.dispatch.workgroups
 //  CHECK-NEXT:     %[[ARG5:[a-zA-Z0-9_]+]]: !flow.dispatch.tensor<writeonly:tensor<?xi32>>
 //  CHECK-SAME:     %[[ARG6:[a-zA-Z0-9_]+]]: !flow.dispatch.tensor<writeonly:tensor<?xi32>>
 //       CHECK:     %[[RESULT:.+]]:2 = linalg.generic
 //   CHECK-DAG:     flow.dispatch.tensor.store %[[RESULT]]#0, %[[ARG5]]
 //   CHECK-DAG:     flow.dispatch.tensor.store %[[RESULT]]#1, %[[ARG6]]
-//       CHECK:   return %[[RESULT_OUT]]#0, %[[RESULT_OUT]]#1
+//       CHECK:   util.return %[[RESULT_OUT]]#0, %[[RESULT_OUT]]#1
 
 // -----
 
 // TODO: Maybe this test is now not needed anymore.
 
-func.func @dynamic_slice(%arg0: tensor<?x?xi32>, %arg1: tensor<i32>, %arg2: tensor<i32>, %arg3 : index) -> tensor<1x?xi32> {
+util.func public @dynamic_slice(%arg0: tensor<?x?xi32>, %arg1: tensor<i32>, %arg2: tensor<i32>, %arg3 : index) -> tensor<1x?xi32> {
   %c1_i32 = arith.constant 1 : i32
   %c0_i32 = arith.constant 0 : i32
   %0 = tensor.extract %arg1[] : tensor<i32>
@@ -755,9 +755,9 @@
   %10 = arith.select %9, %8, %c0_i32 : i32
   %11 = arith.index_cast %10 : i32 to index
   %12 = tensor.extract_slice %arg0[%5, %11] [1, %arg3] [1, 1] : tensor<?x?xi32> to tensor<1x?xi32>
-  return %12 : tensor<1x?xi32>
+  util.return %12 : tensor<1x?xi32>
 }
-// CHECK-LABEL: func.func @dynamic_slice(
+// CHECK-LABEL: util.func public @dynamic_slice(
 //  CHECK-SAME:   %[[ARG0:.+]]: tensor<?x?xi32>
 //  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: tensor<i32>
 //  CHECK-SAME:   %[[ARG2:[a-zA-Z0-9_]+]]: tensor<i32>
@@ -786,11 +786,11 @@
 //       CHECK:     flow.dispatch.tensor.load %[[ARG0_CAPTURE]]
 //       CHECK:     flow.dispatch.tensor.store %{{.*}}, %[[DEST_CAPTURE]]
 //       CHECK:     flow.return
-//       CHECK:   return %[[RESULT]]
+//       CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @dynamic_dot() -> tensor<?x?xf32> {
+util.func public @dynamic_dot() -> tensor<?x?xf32> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %cst = arith.constant 0.000000e+00 : f32
@@ -801,9 +801,9 @@
   %4 = tensor.empty(%2, %3) : tensor<?x?xf32>
   %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
   %6 = linalg.matmul ins(%0, %1 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%5 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %6 : tensor<?x?xf32>
+  util.return %6 : tensor<?x?xf32>
 }
-// CHECK-LABEL: func.func @dynamic_dot()
+// CHECK-LABEL: util.func public @dynamic_dot()
 //   CHECK-NOT:    linalg.fill
 //   CHECK-NOT:    linalg.matmul
 //       CHECK:    flow.dispatch.workgroups
@@ -812,11 +812,11 @@
 //       CHECK:      flow.return
 //   CHECK-NOT:    linalg.fill
 //   CHECK-NOT:    linalg.matmul
-//       CHECK:    return
+//       CHECK:    util.return
 
 // -----
 
-func.func @scatter(
+util.func public @scatter(
     %original : tensor<?x?xf32>, %indices : tensor<?x1xi32>,
     %update : tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = iree_linalg_ext.scatter
@@ -828,9 +828,9 @@
         %1 = arith.addf %arg0, %arg1 : f32
         iree_linalg_ext.yield %1 : f32
   } -> tensor<?x?xf32>
-  return %0 : tensor<?x?xf32>
+  util.return %0 : tensor<?x?xf32>
 }
-//      CHECK: func.func @scatter(
+//      CHECK: util.func public @scatter(
 // CHECK-SAME:     %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
 // CHECK-SAME:     %[[ARG1:[a-zA-Z0-9_]+]]: tensor<?x1xi32>
 // CHECK-SAME:     %[[ARG2:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
@@ -855,11 +855,11 @@
 // CHECK-SAME:               ins(%[[UPDATE]], %[[INDICES]] : tensor<?x?xf32>, tensor<?x1xi32>)
 // CHECK-SAME:               outs(%[[ORIGINAL]] : tensor<?x?xf32>)
 //      CHECK:       flow.dispatch.tensor.store %[[SCATTER]], %[[ARG0_CAPTURE]]
-//      CHECK:   return %[[RESULT]] : tensor<?x?xf32>
+//      CHECK:   util.return %[[RESULT]] : tensor<?x?xf32>
 
 // -----
 
-func.func @sort_3d(%arg0: tensor<?x?x?xi32>, %arg1 : tensor<?x?x?xf32>)
+util.func public @sort_3d(%arg0: tensor<?x?x?xi32>, %arg1 : tensor<?x?x?xf32>)
     -> (tensor<?x?x?xi32>, tensor<?x?x?xf32>) {
   %0, %1 = iree_linalg_ext.sort dimension(0)
       outs(%arg0, %arg1 : tensor<?x?x?xi32>, tensor<?x?x?xf32>) {
@@ -867,9 +867,9 @@
         %2 = arith.cmpf ogt, %arg4, %arg5 : f32
         iree_linalg_ext.yield %2 : i1
       } -> tensor<?x?x?xi32>, tensor<?x?x?xf32>
-  return %0, %1 : tensor<?x?x?xi32>, tensor<?x?x?xf32>
+  util.return %0, %1 : tensor<?x?x?xi32>, tensor<?x?x?xf32>
 }
-//      CHECK: func.func @sort_3d(
+//      CHECK: util.func public @sort_3d(
 // CHECK-SAME:     %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?x?xi32>
 // CHECK-SAME:     %[[ARG1:[a-zA-Z0-9_]+]]: tensor<?x?x?xf32>
 //  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
@@ -906,11 +906,11 @@
 // CHECK-SAME:           offsets = [0, 0, 0], sizes = [%[[ARG1_D0_W]], %[[ARG1_D1_W]], %[[ARG1_D2_W]]]
 //      CHECK:     flow.return
 //      CHECK:   }
-//      CHECK:   return %[[RESULT_OUT]]#0, %[[RESULT_OUT]]#1
+//      CHECK:   util.return %[[RESULT_OUT]]#0, %[[RESULT_OUT]]#1
 
 // -----
 
-func.func @scatter_static(%arg0 : tensor<4xi32>, %arg1 : tensor<4x1xi32>, %arg2 : tensor<8xi32>)
+util.func public @scatter_static(%arg0 : tensor<4xi32>, %arg1 : tensor<4x1xi32>, %arg2 : tensor<8xi32>)
     -> tensor<8xi32>{
   %cst = arith.constant dense<[0, 9, 0, 10, 11, 0, 0, 12]> : tensor<8xi32>
   %cst_0 = arith.constant dense<[9, 10, 11, 12]> : tensor<4xi32>
@@ -924,9 +924,9 @@
     ^bb0(%arg3: i32, %arg4: i32):  // no predecessors
       iree_linalg_ext.yield %arg3 : i32
     } -> tensor<8xi32>
-  return %0 : tensor<8xi32>
+  util.return %0 : tensor<8xi32>
 }
-//      CHECK: func.func @scatter_static
+//      CHECK: util.func public @scatter_static
 // CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: tensor<4xi32>
 // CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: tensor<4x1xi32>
 // CHECK-SAME:   %[[ARG2:[a-zA-Z0-9_]+]]: tensor<8xi32>
@@ -936,22 +936,22 @@
 // CHECK-SAME:     %[[ARG5:[a-zA-Z0-9_]+]]: !flow.dispatch.tensor<readwrite:tensor<8xi32>>
 //      CHECK:     %[[SCATTER_TILE:.+]] = iree_linalg_ext.scatter
 //      CHECK:     flow.dispatch.tensor.store %[[SCATTER_TILE]], %[[ARG5]], offsets = [0], sizes = [8], strides = [1]
-//      CHECK:  return %[[RESULT]]
+//      CHECK:  util.return %[[RESULT]]
 
 // -----
 
 // Check that we are distributing along the last three dimensions for NHWC-output pooling op.
 
-func.func @pooling_nwhc_sum_static(%input: tensor<1x33x33x160xf32>) -> tensor<1x3x3x160xf32> {
+util.func public @pooling_nwhc_sum_static(%input: tensor<1x33x33x160xf32>) -> tensor<1x3x3x160xf32> {
   %cst = arith.constant 0.0 : f32
   %1 = tensor.empty() : tensor<1x3x3x160xf32>
   %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<1x3x3x160xf32>) -> tensor<1x3x3x160xf32>
   %3 = tensor.empty() : tensor<11x11xf32>
   %4 = linalg.pooling_nhwc_sum {dilations = dense<1> : vector<2xi64>, strides = dense<11> : vector<2xi64>} ins(%input, %3 : tensor<1x33x33x160xf32>, tensor<11x11xf32>) outs(%2 : tensor<1x3x3x160xf32>) -> tensor<1x3x3x160xf32>
-  return %4 : tensor<1x3x3x160xf32>
+  util.return %4 : tensor<1x3x3x160xf32>
 }
 
-// CHECK-LABEL: func.func @pooling_nwhc_sum_static
+// CHECK-LABEL: util.func public @pooling_nwhc_sum_static
 //       CHECK:   %[[DISPATCH:.+]] = flow.dispatch.workgroups(
 //   CHECK-DAG:     %[[INPUT:.+]] = flow.dispatch.tensor.load
 //   CHECK-DAG:     %[[EMPTY0:.+]] = tensor.empty() : tensor<1x3x3x160xf32>
@@ -963,11 +963,11 @@
 //  CHECK-SAME:         outs(%[[FILL]] :
 //       CHECK:     flow.dispatch.tensor.store %[[POOL]]
 //       CHECK:     flow.return
-//       CHECK:   return %[[DISPATCH]]
+//       CHECK:   util.return %[[DISPATCH]]
 
 // -----
 
-func.func @named_op_outs_fusion(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>) -> tensor<?x?xf32> {
+util.func public @named_op_outs_fusion(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>) -> tensor<?x?xf32> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %cst1 = arith.constant -1.0 : f64
@@ -980,9 +980,9 @@
       outs(%init : tensor<?x?xf32>) -> tensor<?x?xf32>
   %matmul = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
       outs(%fill : tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %matmul : tensor<?x?xf32>
+  util.return %matmul : tensor<?x?xf32>
 }
-// CHECK-LABEL: func.func @named_op_outs_fusion
+// CHECK-LABEL: util.func public @named_op_outs_fusion
 //       CHECK:   flow.dispatch.workgroups
 //       CHECK:     %[[FILL:.+]] = linalg.fill_rng_2d
 //       CHECK:     linalg.matmul
@@ -990,7 +990,7 @@
 
 // -----
 
-func.func @dynamic_slice(%arg0 : i32, %arg1 : i32, %arg2 : tensor<?xi32>,
+util.func public @dynamic_slice(%arg0 : i32, %arg1 : i32, %arg2 : tensor<?xi32>,
     %arg3 : tensor<?x?xi32>) -> tensor<?x?xi32>{
   %c0 = arith.constant 0 : index
   %c0_i32 = arith.constant 0 : i32
@@ -1008,9 +1008,9 @@
   %d0 = tensor.dim %arg2, %c0 : tensor<?xi32>
   %17 = tensor.insert_slice %arg2 into
       %arg3[%9, %15] [1, %d0] [1, 1] : tensor<?xi32> into tensor<?x?xi32>
-  return %17 : tensor<?x?xi32>
+  util.return %17 : tensor<?x?xi32>
 }
-// CHECK-LABEL: func.func @dynamic_slice
+// CHECK-LABEL: util.func public @dynamic_slice
 //  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: i32
 //  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: i32
 //  CHECK-SAME:     %[[ARG2:.+]]: tensor<?xi32>
@@ -1025,13 +1025,13 @@
 
 // -----
 
-func.func @extract_slice(%arg0 : tensor<?x?xf32>, %arg1 : index, %arg2 : index,
+util.func public @extract_slice(%arg0 : tensor<?x?xf32>, %arg1 : index, %arg2 : index,
     %arg3 : index, %arg4 : index, %arg5 : index, %arg6 : index) -> tensor<?x?xf32> {
   %0 = tensor.extract_slice %arg0[%arg1, %arg2] [%arg3, %arg4] [%arg5, %arg6] :
       tensor<?x?xf32> to tensor<?x?xf32>
-  return %0 : tensor<?x?xf32>
+  util.return %0 : tensor<?x?xf32>
 }
-//      CHECK: func.func @extract_slice
+//      CHECK: util.func public @extract_slice
 // CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
 // CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index
 // CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: index
@@ -1070,18 +1070,18 @@
 // -----
 
 // TODO(ravishankarm): Enable after upstream pad op tiling issues are addressed.
-// func.func @tensor.pad(%arg0 : tensor<?x?xf32>, %arg1 : index, %arg2 : index,
+// util.func public @tensor.pad(%arg0 : tensor<?x?xf32>, %arg1 : index, %arg2 : index,
 //     %arg3 : index, %arg4 : index, %arg5 : f32) -> tensor<?x?xf32> {
 //   %0 = tensor.pad %arg0 low[%arg1, %arg2] high[%arg3, %arg4] {
 //     ^bb0(%arg6 : index, %arg7 : index):
 //       tensor.yield %arg5 : f32
 //   } :  tensor<?x?xf32> to tensor<?x?xf32>
-//   return %0 : tensor<?x?xf32>
+//   util.return %0 : tensor<?x?xf32>
 // }
 
 // -----
 
-func.func @inline_cst(%arg0 : tensor<4x32xi32>) -> tensor<32xi32> {
+util.func public @inline_cst(%arg0 : tensor<4x32xi32>) -> tensor<32xi32> {
   %cst = arith.constant dense<0> : tensor<32xi32>
   %0 = linalg.generic {
       indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>],
@@ -1091,16 +1091,16 @@
         %1 = arith.addi %arg1, %arg2 : i32
         linalg.yield %1 : i32
       } -> tensor<32xi32>
-  return %0 : tensor<32xi32>
+  util.return %0 : tensor<32xi32>
 }
-//      CHECK: func.func @inline_cst(%[[ARG0:.+]]: tensor<4x32xi32>)
+//      CHECK: util.func public @inline_cst(%[[ARG0:.+]]: tensor<4x32xi32>)
 //      CHECK:   flow.dispatch.workgroups
 // CHECK-SAME:     (%[[ARG0]])
 //      CHECK:     %[[CST:.+]] = arith.constant dense<0> : tensor<32xi32>
 
 // -----
 
-func.func @inline_cst2(%arg0 : tensor<4x2xi32>) -> tensor<2xi32> {
+util.func public @inline_cst2(%arg0 : tensor<4x2xi32>) -> tensor<2xi32> {
   %cst = arith.constant dense<[21, 42]> : tensor<2xi32>
   %0 = linalg.generic {
       indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>],
@@ -1110,9 +1110,9 @@
         %1 = arith.addi %arg1, %arg2 : i32
         linalg.yield %1 : i32
       } -> tensor<2xi32>
-  return %0 : tensor<2xi32>
+  util.return %0 : tensor<2xi32>
 }
-// CHECK-LABEL: func.func @inline_cst2(
+// CHECK-LABEL: util.func public @inline_cst2(
 //  CHECK-SAME:     %[[ARG0:.+]]: tensor<4x2xi32>)
 //       CHECK:   flow.dispatch.workgroups
 //  CHECK-SAME:     (%[[ARG0]])
@@ -1120,14 +1120,14 @@
 
 // -----
 
-func.func @gemm_unitN(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x1xf32>,
+util.func public @gemm_unitN(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x1xf32>,
     %arg2 : tensor<?x1xf32>) -> tensor<?x1xf32> {
   %0 = linalg.matmul
       ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x1xf32>)
       outs(%arg2 : tensor<?x1xf32>) -> tensor<?x1xf32>
-  return %0 : tensor<?x1xf32>
+  util.return %0 : tensor<?x1xf32>
 }
-// CHECK-LABEL: func.func @gemm_unitN(
+// CHECK-LABEL: util.func public @gemm_unitN(
 //  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>,
 //  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: tensor<?x1xf32>,
 //  CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: tensor<?x1xf32>)
@@ -1141,27 +1141,27 @@
 
 // -----
 
-func.func @gemm_unitM_unitN(%arg0 : tensor<1x1xf32>, %arg1 : tensor<1x1xf32>,
+util.func public @gemm_unitM_unitN(%arg0 : tensor<1x1xf32>, %arg1 : tensor<1x1xf32>,
     %arg2 : tensor<1x1xf32>) -> tensor<1x1xf32> {
   %0 = linalg.matmul
       ins(%arg0, %arg1 : tensor<1x1xf32>, tensor<1x1xf32>)
       outs(%arg2 : tensor<1x1xf32>) -> tensor<1x1xf32>
-  return %0 : tensor<1x1xf32>
+  util.return %0 : tensor<1x1xf32>
 }
-// CHECK-LABEL: func.func @gemm_unitM_unitN(
+// CHECK-LABEL: util.func public @gemm_unitM_unitN(
 //       CHECK:   flow.dispatch.workgroups(
 //       CHECK:     linalg.matmul
 
 // -----
 
-func.func @gemm_unitM(%arg0 : tensor<1x?xf32>, %arg1 : tensor<?x?xf32>,
+util.func public @gemm_unitM(%arg0 : tensor<1x?xf32>, %arg1 : tensor<?x?xf32>,
     %arg2 : tensor<1x?xf32>) -> tensor<1x?xf32> {
   %0 = linalg.matmul
       ins(%arg0, %arg1 : tensor<1x?xf32>, tensor<?x?xf32>)
       outs(%arg2 : tensor<1x?xf32>) -> tensor<1x?xf32>
-  return %0 : tensor<1x?xf32>
+  util.return %0 : tensor<1x?xf32>
 }
-// CHECK-LABEL: func.func @gemm_unitM(
+// CHECK-LABEL: util.func public @gemm_unitM(
 //  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<1x?xf32>,
 //  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: tensor<?x?xf32>,
 //  CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: tensor<1x?xf32>)
@@ -1176,7 +1176,7 @@
 // -----
 
 #map = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)>
-func.func @unit_dim_generic(%arg0 : tensor<1x?x1x1x?x?x1x?xf32>,
+util.func public @unit_dim_generic(%arg0 : tensor<1x?x1x1x?x?x1x?xf32>,
     %arg1 : tensor<1x?x1x1x?x?x1x?xf32>) -> tensor<1x?x1x1x?x?x1x?xf32> {
   %0 = linalg.generic {
       indexing_maps = [#map, #map, #map],
@@ -1187,9 +1187,9 @@
         %1 = arith.addf %arg2, %arg3 : f32
         linalg.yield %1 : f32
       } -> tensor<1x?x1x1x?x?x1x?xf32>
-  return %0 : tensor<1x?x1x1x?x?x1x?xf32>
+  util.return %0 : tensor<1x?x1x1x?x?x1x?xf32>
 }
-//      CHECK: func.func @unit_dim_generic(
+//      CHECK: util.func public @unit_dim_generic(
 // CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<1x?x1x1x?x?x1x?xf32>
 // CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: tensor<1x?x1x1x?x?x1x?xf32>)
 //  CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
@@ -1209,7 +1209,7 @@
 
 // -----
 
-func.func @dont_fuse_tensor_insert_dest_producer(%arg0 : tensor<2x2xf32>) -> tensor<3x3xf32> {
+util.func public @dont_fuse_tensor_insert_dest_producer(%arg0 : tensor<2x2xf32>) -> tensor<3x3xf32> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %cst = arith.constant dense<0.0> : tensor<3x3xf32>
@@ -1224,9 +1224,9 @@
     } -> tensor<2x2xf32>
   %1 = tensor.insert_slice %0 into %cst[0, 0] [2, 2] [1, 1]
       : tensor<2x2xf32> into tensor<3x3xf32>
-  return %1 : tensor<3x3xf32>
+  util.return %1 : tensor<3x3xf32>
 }
-//      CHECK: func.func @dont_fuse_tensor_insert_dest_producer
+//      CHECK: util.func public @dont_fuse_tensor_insert_dest_producer
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<2x2xf32>
 //      CHECK:   %[[CST:.+]] = arith.constant {{.+}} : tensor<3x3xf32>
 //      CHECK:   %[[DISPATCH1:.+]] = flow.dispatch.workgroups
@@ -1234,26 +1234,26 @@
 //      CHECK:       flow.return
 //      CHECK:   %[[DISPATCH2:.+]] = flow.dispatch.workgroups
 // CHECK-SAME:       (%[[DISPATCH1]], %[[CST]])
-//      CHECK:   return %[[DISPATCH2]]
+//      CHECK:   util.return %[[DISPATCH2]]
 
 // -----
 
-func.func @fill_op_alone(%arg0 : index, %arg1 : index) -> tensor<?x?xf32> {
+util.func public @fill_op_alone(%arg0 : index, %arg1 : index) -> tensor<?x?xf32> {
   %cst = arith.constant 42.0 : f32
   %0 = tensor.empty(%arg0, %arg1) : tensor<?x?xf32>
   %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %1 : tensor<?x?xf32>
+  util.return %1 : tensor<?x?xf32>
 }
-//      CHECK: func.func @fill_op_alone(
+//      CHECK: util.func public @fill_op_alone(
 // CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index
 // CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index
 //      CHECK:   %[[SPLAT:.+]] = flow.tensor.splat %[[CST]] : tensor<?x?xf32>{%arg0, %arg1}
-//      CHECK:   return %[[SPLAT]]
+//      CHECK:   util.return %[[SPLAT]]
 
 // -----
 
 // Reshapes cannot be fused until #8637 is fixed.
-func.func @dont_fuse_reshape(%lhs : tensor<?xf32>, %rhs1 : tensor<4x?xf32>, %rhs2 : tensor<4x?xf32>)
+util.func public @dont_fuse_reshape(%lhs : tensor<?xf32>, %rhs1 : tensor<4x?xf32>, %rhs2 : tensor<4x?xf32>)
   -> (tensor<?x?xf32>, tensor<?x?xf32>)
 {
   %cst = arith.constant 0.0 : f32
@@ -1273,9 +1273,9 @@
   %2= linalg.matmul
     ins(%0, %rhs2 : tensor<?x4xf32>, tensor<4x?xf32>)
     outs(%fill2 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %1, %2 : tensor<?x?xf32>, tensor<?x?xf32>
+  util.return %1, %2 : tensor<?x?xf32>, tensor<?x?xf32>
 }
-//      CHECK: func.func @dont_fuse_reshape(
+//      CHECK: util.func public @dont_fuse_reshape(
 // CHECK-SAME:     %[[LHS:.+]]: tensor<?xf32>
 //  CHECK-DAG:   %[[RESHAPE:.+]] = flow.tensor.reshape %[[LHS]]
 //      CHECK:   %[[DISPATCH1:.+]] = flow.dispatch.workgroups
@@ -1290,39 +1290,39 @@
 //      CHECK:     linalg.fill
 //      CHECK:     linalg.matmul
 //      CHECK:     flow.return
-//      CHECK:   return %[[DISPATCH1]], %[[DISPATCH2]]
+//      CHECK:   util.return %[[DISPATCH1]], %[[DISPATCH2]]
 
 // -----
 
 // TODO: Maybe this test is now not needed anymore.
 
-func.func @concat_pattern(%src1 : tensor<2x40xf32>, %src2 : tensor<3x40xf32>,
+util.func public @concat_pattern(%src1 : tensor<2x40xf32>, %src2 : tensor<3x40xf32>,
     %dest : tensor<5x40xf32>) -> tensor<5x40xf32> {
   %0 = tensor.insert_slice %src1 into %dest[0, 0] [2, 40] [1, 1]
       : tensor<2x40xf32> into tensor<5x40xf32>
   %1 = tensor.insert_slice %src2 into %0[2, 0] [3, 40] [1, 1]
       : tensor<3x40xf32> into tensor<5x40xf32>
-  return %1 : tensor<5x40xf32>
+  util.return %1 : tensor<5x40xf32>
 }
-//      CHECK: func.func @concat_pattern
+//      CHECK: util.func public @concat_pattern
 // CHECK-SAME:     %[[SRC1:.+]]: tensor<2x40xf32>
 // CHECK-SAME:     %[[SRC2:.+]]: tensor<3x40xf32>
 // CHECK-SAME:     %[[DEST:.+]]: tensor<5x40xf32>
 //      CHECK:   %[[UPDATE1:.+]] = flow.tensor.update %[[SRC1]], %[[DEST]]
 //      CHECK:   %[[UPDATE2:.+]] = flow.tensor.update %[[SRC2]], %[[UPDATE1]]
-//      CHECK:   return %[[UPDATE2]]
+//      CHECK:   util.return %[[UPDATE2]]
 
 // -----
 
-func.func @generic_tensor_insert(%arg0 : tensor<?x?xf32>,
+util.func public @generic_tensor_insert(%arg0 : tensor<?x?xf32>,
     %arg1 : index, %arg2 : index, %arg3 : index, %arg4 : index,
     %arg5 : index, %arg6 : index, %arg7 : index, %arg8 : index,
     %arg9 : index, %arg10 : tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = tensor.extract_slice %arg0[%arg1, %arg2] [1, %arg3] [%arg4, %arg5] : tensor<?x?xf32> to tensor<?xf32>
   %1 = tensor.insert_slice %0 into %arg10[%arg6, %arg7] [%arg3, 1] [%arg8, %arg9] : tensor<?xf32> into tensor<?x?xf32>
-  return %1 : tensor<?x?xf32>
+  util.return %1 : tensor<?x?xf32>
 }
-//      CHECK: func.func @generic_tensor_insert(
+//      CHECK: util.func public @generic_tensor_insert(
 // CHECK-SAME:     %[[SOURCE:[a-zA-Z0-9]+]]: tensor<?x?xf32>
 // CHECK-SAME:     %[[SOURCE_OFFSET_Y:[a-zA-Z0-9]+]]: index
 // CHECK-SAME:     %[[SOURCE_OFFSET_X:[a-zA-Z0-9]+]]: index
@@ -1390,7 +1390,7 @@
 
 #map0 = affine_map<(d0, d1) -> (d0, d1)>
 #map1 = affine_map<(d0, d1) -> (d1)>
-func.func @multi_use_producer_fusion(%arg0 : tensor<?x8xf32>, %arg1 : tensor<8x?xf32>,
+util.func public @multi_use_producer_fusion(%arg0 : tensor<?x8xf32>, %arg1 : tensor<8x?xf32>,
     %arg2 : tensor<?xf32>) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
@@ -1409,9 +1409,9 @@
       %0 = arith.addf %b0, %b1 : f32
       linalg.yield %0 : f32
     } -> tensor<?x?xf32>
-  return %matmul, %generic : tensor<?x?xf32>, tensor<?x?xf32>
+  util.return %matmul, %generic : tensor<?x?xf32>, tensor<?x?xf32>
 }
-//      CHECK: func @multi_use_producer_fusion
+//      CHECK: util.func public @multi_use_producer_fusion
 // CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x8xf32>
 // CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: tensor<8x?xf32>
 // CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: tensor<?xf32>
@@ -1447,20 +1447,20 @@
 // CHECK-SAME:         outs(%[[INIT]] :
 //  CHECK-DAG:     flow.dispatch.tensor.store %[[GENERIC]], %[[RESULT0]]
 //  CHECK-DAG:     flow.dispatch.tensor.store %[[MATMUL]], %[[RESULT1]]
-//      CHECK:   return %[[DISPATCH]]#1, %[[DISPATCH]]#0
+//      CHECK:   util.return %[[DISPATCH]]#1, %[[DISPATCH]]#0
 
 // -----
 
-func.func @fft_cst_output(%arg0 : tensor<3x2190x1x512xf32>) -> (tensor<3x2190x1x512xf32>, tensor<3x2190x1x512xf32>) {
+util.func public @fft_cst_output(%arg0 : tensor<3x2190x1x512xf32>) -> (tensor<3x2190x1x512xf32>, tensor<3x2190x1x512xf32>) {
   %c1 = arith.constant 1 : index
   %cst = arith.constant dense<1.000000e+00> : tensor<1xf32>
   %cst_0 = arith.constant dense<-0.000000e+00> : tensor<1xf32>
   %cst_1 = arith.constant dense<0.000000e+00> : tensor<3x2190x1x512xf32>
   %0:2 = iree_linalg_ext.fft ins(%c1, %cst, %cst_0 : index, tensor<1xf32>, tensor<1xf32>)
       outs(%arg0, %cst_1 : tensor<3x2190x1x512xf32>, tensor<3x2190x1x512xf32>) : tensor<3x2190x1x512xf32>, tensor<3x2190x1x512xf32>
-  return %0#0, %0#1 : tensor<3x2190x1x512xf32>, tensor<3x2190x1x512xf32>
+  util.return %0#0, %0#1 : tensor<3x2190x1x512xf32>, tensor<3x2190x1x512xf32>
 }
-//      CHECK: func @fft_cst_output
+//      CHECK: util.func public @fft_cst_output
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<3x2190x1x512xf32>
 //      CHECK:   %[[DISPATCH:.+]] = flow.dispatch.workgroups
 // CHECK-SAME:       (%[[ARG0]]) : (tensor<3x2190x1x512xf32>) -> (%[[ARG0]], tensor<3x2190x1x512xf32>)
@@ -1475,7 +1475,7 @@
 // -----
 
 
-func.func @fuse_conv2d_elementwise(%input: tensor<1x225x225x16xf32>, %filter: tensor<3x3x16x32xf32>, %offset: tensor<32xf32>) -> tensor<1x112x112x32xf32> {
+util.func public @fuse_conv2d_elementwise(%input: tensor<1x225x225x16xf32>, %filter: tensor<3x3x16x32xf32>, %offset: tensor<32xf32>) -> tensor<1x112x112x32xf32> {
   %cst = arith.constant 0.000000e+00 : f32
   %0 = tensor.empty() : tensor<1x112x112x32xf32>
   %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
@@ -1496,7 +1496,7 @@
             %sub = arith.subf %a, %b : f32
             linalg.yield %sub : f32
          } -> tensor<1x112x112x32xf32>
-  return %3 : tensor<1x112x112x32xf32>
+  util.return %3 : tensor<1x112x112x32xf32>
 }
 
 // Check that
@@ -1504,7 +1504,7 @@
 // * linalg.generic's linalg.fill is pulled into the same group;
 // * linalg.conv's linalg.fill is pulled into the same group.
 
-// CHECK-LABEL: func.func @fuse_conv2d_elementwise
+// CHECK-LABEL: util.func public @fuse_conv2d_elementwise
 
 //      CHECK: flow.dispatch.workgroups
 //      CHECK:   %[[INIT:.+]] = tensor.empty
@@ -1518,7 +1518,7 @@
 
 // -----
 
-func.func @fuse_conv2d_with_multiple_uses(%input: tensor<1x225x225x16xf32>, %filter: tensor<3x3x16x32xf32>, %offset: tensor<32xf32>)
+util.func public @fuse_conv2d_with_multiple_uses(%input: tensor<1x225x225x16xf32>, %filter: tensor<3x3x16x32xf32>, %offset: tensor<32xf32>)
   -> (tensor<1x112x112x32xf32>, tensor<1x112x112x32xf32>) {
   %cst = arith.constant 0.000000e+00 : f32
   %0 = tensor.empty() : tensor<1x112x112x32xf32>
@@ -1540,10 +1540,10 @@
             %sub = arith.subf %a, %b : f32
             linalg.yield %sub : f32
          } -> tensor<1x112x112x32xf32>
-  return %3, %2 : tensor<1x112x112x32xf32>, tensor<1x112x112x32xf32>
+  util.return %3, %2 : tensor<1x112x112x32xf32>, tensor<1x112x112x32xf32>
 }
 
-// CHECK-LABEL: func.func @fuse_conv2d_with_multiple_uses
+// CHECK-LABEL: util.func public @fuse_conv2d_with_multiple_uses
 //       CHECK:   %[[DISPATCH:.+]]:2 = flow.dispatch.workgroups
 //  CHECK-NEXT:       %[[OUT1:[a-zA-Z0-9]+]]: !flow.dispatch.tensor<writeonly:tensor<1x112x112x32xf32>>
 //  CHECK-SAME:       %[[OUT2:.+]]: !flow.dispatch.tensor<writeonly:tensor<1x112x112x32xf32>>
@@ -1551,11 +1551,11 @@
 //       CHECK:     %[[GENERIC:.+]] = linalg.generic
 //   CHECK-DAG:     flow.dispatch.tensor.store %[[GENERIC]], %[[OUT1]]
 //   CHECK-DAG:     flow.dispatch.tensor.store %[[CONV]], %[[OUT2]]
-//       CHECK:   return %[[DISPATCH]]#0, %[[DISPATCH]]#1
+//       CHECK:   util.return %[[DISPATCH]]#0, %[[DISPATCH]]#1
 
 // -----
 
-func.func @dont_fuse_conv2d_with_non_identity_map(%input: tensor<1x225x225x16xf32>, %filter: tensor<3x3x16x32xf32>, %offset: tensor<32xf32>) -> tensor<1x112x112x32xf32> {
+util.func public @dont_fuse_conv2d_with_non_identity_map(%input: tensor<1x225x225x16xf32>, %filter: tensor<3x3x16x32xf32>, %offset: tensor<32xf32>) -> tensor<1x112x112x32xf32> {
   %cst = arith.constant 0.000000e+00 : f32
   %0 = tensor.empty() : tensor<1x112x112x32xf32>
   %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
@@ -1576,10 +1576,10 @@
             %sub = arith.subf %a, %b : f32
             linalg.yield %sub : f32
          } -> tensor<1x112x112x32xf32>
-  return %3 : tensor<1x112x112x32xf32>
+  util.return %3 : tensor<1x112x112x32xf32>
 }
 
-// CHECK-LABEL: func.func @dont_fuse_conv2d_with_non_identity_map
+// CHECK-LABEL: util.func public @dont_fuse_conv2d_with_non_identity_map
 
 // CHECK: flow.dispatch.workgroups
 // CHECK:   linalg.conv_2d_nhwc_hwcf
@@ -1592,7 +1592,7 @@
 #map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
 #map2 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 
-func.func @reduction_broadcast_elementwise_unary(%a: tensor<12x16x16xf32>, %b: tensor<12x16x16xf32>) -> tensor<12x16x16xf32> {
+util.func public @reduction_broadcast_elementwise_unary(%a: tensor<12x16x16xf32>, %b: tensor<12x16x16xf32>) -> tensor<12x16x16xf32> {
   %cst_47 = arith.constant 0.000000e+00 : f32
   %37 = tensor.empty() : tensor<12x16xf32>
   %38 = linalg.fill ins(%cst_47 : f32) outs(%37 : tensor<12x16xf32>) -> tensor<12x16xf32>
@@ -1607,14 +1607,14 @@
     %780 = arith.subf %arg3, %arg4 : f32
     linalg.yield %780 : f32
   } -> tensor<12x16x16xf32>
-  return %42 : tensor<12x16x16xf32>
+  util.return %42 : tensor<12x16x16xf32>
 }
 
 // There is only one input to the reduction.
 // Check that two generic ops are dispatched together.
 // The first generic (reduction) is directly used by the second generic (elementwise).
 
-// CHECK-LABEL: func.func @reduction_broadcast_elementwise_unary
+// CHECK-LABEL: util.func public @reduction_broadcast_elementwise_unary
 //      CHECK: flow.dispatch.workgroups
 //      CHECK:   %[[RED:.+]] = linalg.generic
 //      CHECK:   linalg.generic
@@ -1625,7 +1625,7 @@
 #map1 = affine_map<(d0, d1) -> (d0)>
 #map2 = affine_map<(d0, d1) -> (d0, d1)>
 
-func.func @reduction_broadcast_elementwise_binary1(%a1: tensor<128x384xf32>, %a2: tensor<128xf32>, %b: tensor<128x384xf32>) -> tensor<128x384xf32> {
+util.func public @reduction_broadcast_elementwise_binary1(%a1: tensor<128x384xf32>, %a2: tensor<128xf32>, %b: tensor<128x384xf32>) -> tensor<128x384xf32> {
   %cst_47 = arith.constant 0.000000e+00 : f32
   %37 = tensor.empty() : tensor<128xf32>
   %38 = linalg.fill ins(%cst_47 : f32) outs(%37 : tensor<128xf32>) -> tensor<128xf32>
@@ -1642,14 +1642,14 @@
     %780 = arith.subf %arg3, %arg4 : f32
     linalg.yield %780 : f32
   } -> tensor<128x384xf32>
-  return %42 : tensor<128x384xf32>
+  util.return %42 : tensor<128x384xf32>
 }
 
 // There are two inputs to the reduction and one of them is broadcasted.
 // Check that two generic ops are dispatched together.
 // The first generic (reduction) is directly used by the second generic (elementwise).
 
-// CHECK-LABEL: func.func @reduction_broadcast_elementwise_binary1
+// CHECK-LABEL: util.func public @reduction_broadcast_elementwise_binary1
 //      CHECK: flow.dispatch.workgroups
 //      CHECK:   %[[RED:.+]] = linalg.generic
 //      CHECK:   linalg.generic
@@ -1661,7 +1661,7 @@
 #map2 = affine_map<(d0, d1) -> (d0, d1)>
 #map3 = affine_map<(d0, d1) -> (d1)>
 
-func.func @reduction_broadcast_elementwise_binary2(%a1: tensor<128x384xf32>, %a2: tensor<384xf32>, %b: tensor<128x384xf32>) -> tensor<128x384xf32> {
+util.func public @reduction_broadcast_elementwise_binary2(%a1: tensor<128x384xf32>, %a2: tensor<384xf32>, %b: tensor<128x384xf32>) -> tensor<128x384xf32> {
   %cst_47 = arith.constant 0.000000e+00 : f32
   %37 = tensor.empty() : tensor<128xf32>
   %38 = linalg.fill ins(%cst_47 : f32) outs(%37 : tensor<128xf32>) -> tensor<128xf32>
@@ -1678,14 +1678,14 @@
     %780 = arith.subf %arg3, %arg4 : f32
     linalg.yield %780 : f32
   } -> tensor<128x384xf32>
-  return %42 : tensor<128x384xf32>
+  util.return %42 : tensor<128x384xf32>
 }
 
 // There are two inputs to the reduction and one of them is broadcasted.
 // Check that two generic ops are dispatched together.
 // The first generic (reduction) is directly used by the second generic (elementwise).
 
-// CHECK-LABEL: func.func @reduction_broadcast_elementwise_binary2
+// CHECK-LABEL: util.func public @reduction_broadcast_elementwise_binary2
 //      CHECK: flow.dispatch.workgroups
 //      CHECK:   %[[RED:.+]] = linalg.generic
 //      CHECK:   linalg.generic
@@ -1696,7 +1696,7 @@
 #map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
 #map2 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 
-func.func @reduction_broadcast_elementwise_dynamic(%a: tensor<12x16x?xf32>, %b: tensor<12x16x?xf32>) -> tensor<12x16x?xf32> {
+util.func public @reduction_broadcast_elementwise_dynamic(%a: tensor<12x16x?xf32>, %b: tensor<12x16x?xf32>) -> tensor<12x16x?xf32> {
   %cst_47 = arith.constant 0.000000e+00 : f32
   %37 = tensor.empty() : tensor<12x16xf32>
   %38 = linalg.fill ins(%cst_47 : f32) outs(%37 : tensor<12x16xf32>) -> tensor<12x16xf32>
@@ -1713,12 +1713,12 @@
     %780 = arith.subf %arg3, %arg4 : f32
     linalg.yield %780 : f32
   } -> tensor<12x16x?xf32>
-  return %42 : tensor<12x16x?xf32>
+  util.return %42 : tensor<12x16x?xf32>
 }
 
 // Dynamic shape case is not supported yet by the Vulkan codegen. See #9802.
 
-// CHECK-LABEL: func.func @reduction_broadcast_elementwise_dynamic
+// CHECK-LABEL: util.func public @reduction_broadcast_elementwise_dynamic
 //      CHECK: flow.dispatch.workgroups
 //      CHECK: linalg.generic
 //      CHECK: linalg.generic
@@ -1729,7 +1729,7 @@
 #map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 #map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
 module {
-  func.func @softmax(%arg0: tensor<12x128x128xf32>) -> tensor<12x128x128xf32> {
+  util.func public @softmax(%arg0: tensor<12x128x128xf32>) -> tensor<12x128x128xf32> {
     %cst = arith.constant 1.000000e+00 : f32
     %cst_0 = arith.constant 0.000000e+00 : f32
     %cst_1 = arith.constant -3.40282347E+38 : f32
@@ -1755,10 +1755,10 @@
       %8 = arith.mulf %arg1, %7 : f32
       linalg.yield %8 : f32
     } -> tensor<12x128x128xf32>
-    return %6 : tensor<12x128x128xf32>
+    util.return %6 : tensor<12x128x128xf32>
   }
 }
-// CHECK-LABEL: func @softmax(
+// CHECK-LABEL: util.func public @softmax(
 //  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<12x128x128xf32>
 //       CHECK:   %[[DISPATCH0:.+]] = flow.dispatch.workgroups
 //  CHECK-SAME:       (%[[ARG0]])
@@ -1783,7 +1783,7 @@
 //  CHECK-SAME:         ins(%[[GENERIC1]]#0, %[[GENERIC1]]#1 :
 //       CHECK:     flow.dispatch.tensor.store %[[GENERIC2]]
 //       CHECK:     flow.return
-//       CHECK:   return %[[DISPATCH1]]
+//       CHECK:   util.return %[[DISPATCH1]]
 
 // -----
 
@@ -1791,7 +1791,7 @@
 #map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0)>
 #map2 = affine_map<(d0) -> (d0)>
 module {
-  func.func @batchnorm_training(%arg0: tensor<12xf32>, %arg1: tensor<12x12x12x12x12xf32>, %arg2: tensor<12xf32>) -> (tensor<12xf32>, tensor<12xf32>, tensor<12xf32>) {
+  util.func public @batchnorm_training(%arg0: tensor<12xf32>, %arg1: tensor<12x12x12x12x12xf32>, %arg2: tensor<12xf32>) -> (tensor<12xf32>, tensor<12xf32>, tensor<12xf32>) {
     %cst = arith.constant 1.420000e+00 : f32
     %cst_0 = arith.constant 1.450000e+00 : f32
     %cst_1 = arith.constant 1.300000e+00 : f32
@@ -1815,10 +1815,10 @@
       %9 = arith.subf %arg3, %8 : f32
       linalg.yield %5, %6, %9 : f32, f32, f32
     } -> (tensor<12xf32>, tensor<12xf32>, tensor<12xf32>)
-    return %3#0, %3#1, %3#2 : tensor<12xf32>, tensor<12xf32>, tensor<12xf32>
+    util.return %3#0, %3#1, %3#2 : tensor<12xf32>, tensor<12xf32>, tensor<12xf32>
   }
 }
-// CHECK-LABEL: func @batchnorm_training(
+// CHECK-LABEL: util.func public @batchnorm_training(
 //  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<12xf32>
 //  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: tensor<12x12x12x12x12xf32>
 //  CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: tensor<12xf32>
@@ -1839,17 +1839,17 @@
 //   CHECK-DAG:     flow.dispatch.tensor.store %[[GENERIC1]]#1
 //   CHECK-DAG:     flow.dispatch.tensor.store %[[GENERIC1]]#2
 //       CHECK:     flow.return
-//       CHECK:   return %[[DISPATCH]]#0, %[[DISPATCH]]#1, %[[DISPATCH]]#2
+//       CHECK:   util.return %[[DISPATCH]]#0, %[[DISPATCH]]#1, %[[DISPATCH]]#2
 
 // -----
 
-func.func @set_encoding_op(%arg0 : tensor<?x?xf32>)
+util.func public @set_encoding_op(%arg0 : tensor<?x?xf32>)
     -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>> {
   %0 = iree_linalg_ext.set_encoding %arg0
       : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>
-  return %0 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>
+  util.return %0 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>
 }
-//      CHECK: func @set_encoding_op
+//      CHECK: util.func public @set_encoding_op
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<?x?xf32>
 //  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 //  CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
@@ -1872,17 +1872,17 @@
 //      CHECK:   count(%[[WL0:[a-zA-Z0-9]+]]: index, %[[WL1:[a-zA-Z0-9]+]]: index)
 //      CHECK:     %[[X:[a-zA-Z0-9]+]], %[[Y:[a-zA-Z0-9]+]], %[[Z:.+]] = flow.dispatch.workgroup_count_from_slice %[[WL0]], %[[WL1]]
 //      CHECK:     flow.return %[[X]], %[[Y]], %[[Z]]
-//      CHECK:   return %[[DISPATCH]]
+//      CHECK:   util.return %[[DISPATCH]]
 
 // -----
 
-func.func @unset_encoding_op(%arg0 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>)
+util.func public @unset_encoding_op(%arg0 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>)
     -> tensor<?x?xf32> {
   %0 = iree_linalg_ext.unset_encoding %arg0
       : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>> -> tensor<?x?xf32>
-  return %0 : tensor<?x?xf32>
+  util.return %0 : tensor<?x?xf32>
 }
-//      CHECK: func @unset_encoding_op
+//      CHECK: util.func public @unset_encoding_op
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>
 //  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 //  CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
@@ -1905,12 +1905,12 @@
 //      CHECK:   count(%[[WL0:[a-zA-Z0-9]+]]: index, %[[WL1:[a-zA-Z0-9]+]]: index)
 //      CHECK:     %[[X:[a-zA-Z0-9]+]], %[[Y:[a-zA-Z0-9]+]], %[[Z:.+]] = flow.dispatch.workgroup_count_from_slice %[[WL0]], %[[WL1]]
 //      CHECK:     flow.return %[[X]], %[[Y]], %[[Z]]
-//      CHECK:   return %[[DISPATCH]]
+//      CHECK:   util.return %[[DISPATCH]]
 
 // -----
 
 #map = affine_map<()[s0] -> (-s0 + (s0 ceildiv 16) * 16)>
-func.func @pad_and_set_encoding_op(%arg0 : tensor<?x?xf32>)
+util.func public @pad_and_set_encoding_op(%arg0 : tensor<?x?xf32>)
     -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
@@ -1925,11 +1925,11 @@
     } : tensor<?x?xf32> to tensor<?x?xf32>
   %encoding = iree_linalg_ext.set_encoding %pad
       : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>
-  return %encoding : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>
+  util.return %encoding : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>
 }
 //  CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> ((s0 ceildiv 16) * 16)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0] -> (-s0 + (s0 ceildiv 16) * 16)>
-//      CHECK: func.func @pad_and_set_encoding
+//      CHECK: util.func public @pad_and_set_encoding
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<?x?xf32>
 //  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 //  CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
@@ -1962,20 +1962,20 @@
 // CHECK-SAME:         %[[WL2:[a-zA-Z0-9]+]]: index, %[[WL3:[a-zA-Z0-9]+]]: index)
 //      CHECK:     %[[X:[a-zA-Z0-9]+]], %[[Y:[a-zA-Z0-9]+]], %[[Z:.+]] = flow.dispatch.workgroup_count_from_slice %[[WL0]], %[[WL1]], %[[WL2]], %[[WL3]]
 //      CHECK:     flow.return %[[X]], %[[Y]], %[[Z]]
-//      CHECK:   return %[[DISPATCH]]
+//      CHECK:   util.return %[[DISPATCH]]
 
 // -----
 
-func.func @unset_encoding_and_slice(
+util.func public @unset_encoding_and_slice(
     %arg0: tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>,
     %arg1 : index, %arg2 : index) -> tensor<?x?xf32> {
   %0 = iree_linalg_ext.unset_encoding %arg0
       : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>> -> tensor<?x?xf32>
   %1 = tensor.extract_slice %0[0, 0] [%arg1, %arg2] [1, 1]
       : tensor<?x?xf32> to tensor<?x?xf32>
-  return %1 : tensor<?x?xf32>
+  util.return %1 : tensor<?x?xf32>
 }
-//      CHECK: func @unset_encoding_and_slice
+//      CHECK: util.func public @unset_encoding_and_slice
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>
 // CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index
 // CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: index
@@ -2009,7 +2009,7 @@
 #map = affine_map<(d0, d1) -> (d1)>
 #map1 = affine_map<(d0, d1) -> (d0, d1)>
 module {
-  func.func @root_on_unset_encoding(
+  util.func public @root_on_unset_encoding(
       %arg0: tensor<784x96xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>,
       %arg1: tensor<96xf32>) -> tensor<784x96xf32> {
     %0 = iree_linalg_ext.unset_encoding %arg0 : tensor<784x96xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>> -> tensor<784x96xf32>
@@ -2026,12 +2026,12 @@
       %5 = arith.addf %in, %in_0 : f32
       linalg.yield %5 : f32
     } -> tensor<784x96xf32>
-    return %4 : tensor<784x96xf32>
+    util.return %4 : tensor<784x96xf32>
   }
 }
 //      CHECL: #[[MAP0:.+]] = affine_map<(d0, d1) -> (d1)>
 //      CHECK: #[[MAP1:.+]] = affine_map<(d0, d1) -> (d0, d1)>
-//      CHECK: func @root_on_unset_encoding
+//      CHECK: util.func public @root_on_unset_encoding
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<784x96xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>
 // CHECK-SAME:     %[[ARG1:.+]]: tensor<96xf32>
 //      CHECK:   %[[DISPATCH:.+]] = flow.dispatch.workgroups(%[[ARG0]], %[[ARG1]])
@@ -2055,7 +2055,7 @@
 
 // -----
 
-func.func @gemm_encoded(
+util.func public @gemm_encoded(
     %arg0 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>,
     %arg1 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32]>>,
     %arg2 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32]>>)
@@ -2066,9 +2066,9 @@
             tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32]>>)
       outs(%arg2 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32]>>)
       -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32]>>
-  return %0 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32]>>
+  util.return %0 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32]>>
 }
-//      CHECK: func.func @gemm_encoded
+//      CHECK: util.func public @gemm_encoded
 // CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>
 // CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32]>>
 // CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32]>>
@@ -2086,7 +2086,7 @@
 
 // -----
 
-func.func @gemm_fill_encoded(
+util.func public @gemm_fill_encoded(
     %arg0 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>,
     %arg1 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32]>>)
     -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32]>> {
@@ -2104,9 +2104,9 @@
             tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32]>>)
       outs(%fill : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32]>>)
       -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32]>>
-  return %0 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32]>>
+  util.return %0 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32]>>
 }
-//      CHECK: func.func @gemm_fill_encoded
+//      CHECK: util.func public @gemm_fill_encoded
 // CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>
 // CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: tensor<?x?xf32, #iree_linalg_ext.encoding<role = RHS, element_types = [f32, f32, f32]>>
 //      CHECK:   %[[DISPATCH:.+]] = flow.dispatch.workgroups
@@ -2125,13 +2125,13 @@
 
 // -----
 
-func.func @extract_slice1(%arg0 : tensor<5x24x48xf32>) -> tensor<4xf32> {
+util.func public @extract_slice1(%arg0 : tensor<5x24x48xf32>) -> tensor<4xf32> {
   %0 = tensor.extract_slice %arg0[2, 3, 4] [1, 1, 4] [1, 1, 1]
       : tensor<5x24x48xf32> to tensor<4xf32>
-  return %0 : tensor<4xf32>
+  util.return %0 : tensor<4xf32>
 }
 
-// CHECK-LABEL: func.func @extract_slice1(
+// CHECK-LABEL: util.func public @extract_slice1(
 //  CHECK-SAME:   %[[ARG0:.+]]: tensor<5x24x48xf32>)
 //   CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
 //   CHECK-DAG:   %[[C3:.+]] = arith.constant 3 : index
@@ -2139,11 +2139,11 @@
 //   CHECK-DAG:   %[[C4:.+]] = arith.constant 4 : index
 //       CHECK:   %[[SLICE:.+]] = flow.tensor.slice %[[ARG0]][%[[C2]], %[[C3]], %[[C4]] for %[[C1]], %[[C1]], %[[C4]]]
 //       CHECK:   %[[RESULT:.+]] = flow.tensor.reshape %[[SLICE]]
-//       CHECK:   return %[[RESULT]]
+//       CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @clone_fill_ops(%arg0 : tensor<128x256xf32>, %arg1 : tensor<256x512xf32>,
+util.func public @clone_fill_ops(%arg0 : tensor<128x256xf32>, %arg1 : tensor<256x512xf32>,
     %arg2 : tensor<128x256xf32>, %arg3 : tensor<256x512xf32>)
     -> (tensor<128x512xf32>, tensor<128x512xf32>) {
   %0 = tensor.empty() : tensor<128x512xf32>
@@ -2153,9 +2153,9 @@
       outs(%1 : tensor<128x512xf32>) -> tensor<128x512xf32>
   %3 = linalg.matmul ins(%arg2, %arg3 : tensor<128x256xf32>, tensor<256x512xf32>)
       outs(%1 : tensor<128x512xf32>) -> tensor<128x512xf32>
-  return %2, %3 : tensor<128x512xf32>, tensor<128x512xf32>
+  util.return %2, %3 : tensor<128x512xf32>, tensor<128x512xf32>
 }
-// CHECK-LABEL: func @clone_fill_ops(
+// CHECK-LABEL: util.func public @clone_fill_ops(
 //  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9]+]]: tensor<128x256xf32>
 //  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9]+]]: tensor<256x512xf32>
 //  CHECK-SAME:   %[[ARG2:[a-zA-Z0-9].+]]: tensor<128x256xf32>
@@ -2173,7 +2173,7 @@
 
 // -----
 
-func.func @softmax(%source : tensor<12x128x128xf32>) -> tensor<12x128x128xf32> {
+util.func public @softmax(%source : tensor<12x128x128xf32>) -> tensor<12x128x128xf32> {
   %cst = arith.constant 1.000000e+00 : f32
   %cst_0 = arith.constant 0.000000e+00 : f32
   %cst_1 = arith.constant -3.40282347E+38 : f32
@@ -2203,9 +2203,9 @@
     %10 = arith.mulf %in, %9 : f32
     linalg.yield %10 : f32
   } -> tensor<12x128x128xf32>
-  return %8 : tensor<12x128x128xf32>
+  util.return %8 : tensor<12x128x128xf32>
 }
-// CHECK-LABEL: func @softmax(
+// CHECK-LABEL: util.func public @softmax(
 //  CHECK-SAME:     %[[INPUT:.+]]: tensor<12x128x128xf32>)
 //       CHECK:   %[[DISPATCH0:.+]] = flow.dispatch.workgroups
 //  CHECK-SAME:       (%[[INPUT]])
@@ -2240,4 +2240,4 @@
 //  CHECK-SAME:         ins(%[[INPUT]], %[[GENERIC2]] :
 //  CHECK-SAME:         outs(%[[EMPTY2]] :
 //       CHECK:     flow.dispatch.tensor.store %[[GENERIC3]], %[[ARG3]]
-//       CHECK:   return %[[DISPATCH1]]
+//       CHECK:   util.return %[[DISPATCH1]]

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_default.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_default.mlir
index 9237de9..03d8b16 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_default.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_default.mlir

@@ -1,6 +1,6 @@
-// RUN: iree-opt --split-input-file --verify-diagnostics --pass-pipeline="builtin.module(func.func(iree-flow-form-dispatch-regions, iree-flow-clone-producers-into-dispatch-regions, iree-flow-form-dispatch-workgroups), cse, canonicalize, cse)" %s | FileCheck %s
+// RUN: iree-opt --split-input-file --verify-diagnostics --pass-pipeline="builtin.module(util.func(iree-flow-form-dispatch-regions, iree-flow-clone-producers-into-dispatch-regions, iree-flow-form-dispatch-workgroups), cse, canonicalize, cse)" %s | FileCheck %s
 
-func.func @no_fuse_quantized(%arg0 : tensor<?x113x113x64xi8>, %arg1 : tensor<3x3x64xi8>,
+util.func public @no_fuse_quantized(%arg0 : tensor<?x113x113x64xi8>, %arg1 : tensor<3x3x64xi8>,
     %arg2 : i32, %arg3 : i32) -> tensor<?x56x56x64xi8> {
   %c0 = arith.constant 0 : index
   %c0_i32 = arith.constant 0 : i32
@@ -19,9 +19,9 @@
       %5 = arith.trunci %b0 : i32 to i8
       linalg.yield %5 : i8
     } -> tensor<?x56x56x64xi8>
-  return %4 : tensor<?x56x56x64xi8>
+  util.return %4 : tensor<?x56x56x64xi8>
 }
-//     CHECK: func.func @no_fuse_quantized
+//     CHECK: util.func public @no_fuse_quantized
 //     CHECK:   flow.dispatch.workgroups
 //     CHECK:   linalg.depthwise_conv_2d_nhwc_hwc_q
 // CHECK-NOT:   linalg.generic
@@ -32,7 +32,7 @@
 
 #map = affine_map<(d0, d1) -> (d1)>
 #map1 = affine_map<(d0, d1) -> (d0, d1)>
-func.func @elem_set_encoding(%arg0: tensor<512xf32>, %arg1: tensor<384x512xf32>,
+util.func public @elem_set_encoding(%arg0: tensor<512xf32>, %arg1: tensor<384x512xf32>,
     %arg2: tensor<384x512xf32>) -> tensor<384x512xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>> {
   %0 = tensor.empty() : tensor<384x512xf32>
   %1 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map1],
@@ -45,9 +45,9 @@
     linalg.yield %4 : f32
   } -> tensor<384x512xf32>
   %2 = iree_linalg_ext.set_encoding %1 : tensor<384x512xf32> -> tensor<384x512xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>
-  return %2 : tensor<384x512xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>
+  util.return %2 : tensor<384x512xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>
 }
-// CHECK-LABEL: func.func @elem_set_encoding
+// CHECK-LABEL: util.func public @elem_set_encoding
 // CHECK:         flow.dispatch.workgroups
 // CHECK:           linalg.generic
 // CHECK:           iree_linalg_ext.set_encoding
@@ -55,7 +55,7 @@
 
 // -----
 
-func.func @fix_dominance_on_fusion(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>,
+util.func public @fix_dominance_on_fusion(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>,
     %arg2 : tensor<?xf32>) -> tensor<?x?xf32> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
@@ -75,9 +75,9 @@
       %addf = arith.addf %b0, %b1 : f32
       linalg.yield %addf : f32
   } -> tensor<?x?xf32>
-  return %bias_add : tensor<?x?xf32>
+  util.return %bias_add : tensor<?x?xf32>
 }
-// CHECK-LABEL: func @fix_dominance_on_fusion
+// CHECK-LABEL: util.func public @fix_dominance_on_fusion
 //       CHECK:   %[[RESULT:.+]] = flow.dispatch.workgroups
 //       CHECK:     %[[EMPTY:.+]] = tensor.empty
 //       CHECK:     %[[FILL:.+]] = linalg.fill
@@ -87,4 +87,4 @@
 //       CHECK:     %[[GENERIC:.+]] = linalg.generic
 //  CHECK-SAME:         ins(%[[GEMM]],
 //       CHECK:     flow.dispatch.tensor.store %[[GENERIC]]
-//       CHECK:   return %[[RESULT]]
+//       CHECK:   util.return %[[RESULT]]

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_fusion_with_transpose.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_fusion_with_transpose.mlir
index cc7cc7c..b8e7d1d 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_fusion_with_transpose.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_fusion_with_transpose.mlir

@@ -1,6 +1,6 @@
-// RUN: iree-opt --split-input-file --verify-diagnostics --pass-pipeline="builtin.module(func.func(iree-flow-interchange-transpose-generic-ops,iree-flow-form-dispatch-regions{fuse-multi-use=true}, iree-flow-form-dispatch-workgroups, canonicalize, cse))" %s | FileCheck %s
+// RUN: iree-opt --split-input-file --verify-diagnostics --pass-pipeline="builtin.module(util.func(iree-flow-interchange-transpose-generic-ops,iree-flow-form-dispatch-regions{fuse-multi-use=true}, iree-flow-form-dispatch-workgroups, canonicalize, cse))" %s | FileCheck %s
 
-func.func @fuse_batch_matmul_transpose(%a: tensor<4x384x384xf32>, %b: tensor<4x384x32xf32>) -> tensor<384x4x32xf32> {
+util.func public @fuse_batch_matmul_transpose(%a: tensor<4x384x384xf32>, %b: tensor<4x384x32xf32>) -> tensor<384x4x32xf32> {
   %cst = arith.constant 0.000000e+00 : f32
   %init = tensor.empty() : tensor<4x384x32xf32>
   %c = linalg.fill ins(%cst : f32) outs(%init : tensor<4x384x32xf32>) -> tensor<4x384x32xf32>
@@ -10,7 +10,7 @@
   ^bb0(%arg0: f32, %arg1: f32):
     linalg.yield %arg0 : f32
   } -> tensor<384x4x32xf32>
-  return %transpose : tensor<384x4x32xf32>
+  util.return %transpose : tensor<384x4x32xf32>
 }
 
 // Check that
@@ -19,7 +19,7 @@
 
 //      CHECK: #[[$MAP0:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 //      CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
-// CHECK-LABEL: func.func @fuse_batch_matmul_transpose
+// CHECK-LABEL: util.func public @fuse_batch_matmul_transpose
 //      CHECK: flow.dispatch.workgroups
 //      CHECK:   %[[MATMUL:.+]] = linalg.batch_matmul
 //      CHECK:   linalg.generic
@@ -29,7 +29,7 @@
 
 // -----
 
-func.func @fuse_matmul_transpose(%a: tensor<128x384xf32>, %b: tensor<384x384xf32>) -> tensor<384x128xf32> {
+util.func public @fuse_matmul_transpose(%a: tensor<128x384xf32>, %b: tensor<384x384xf32>) -> tensor<384x128xf32> {
   %cst = arith.constant 0.000000e+00 : f32
   %cst1 = arith.constant 1.000000e+00 : f32
   %init = tensor.empty() : tensor<128x384xf32>
@@ -41,7 +41,7 @@
     %add = arith.addf %arg0, %cst1 : f32
     linalg.yield %add : f32
   } -> tensor<384x128xf32>
-  return %transpose : tensor<384x128xf32>
+  util.return %transpose : tensor<384x128xf32>
 }
 
 // Check that
@@ -50,7 +50,7 @@
 
 //      CHECK: #[[$MAP0:.+]] = affine_map<(d0, d1) -> (d0, d1)>
 //      CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1) -> (d1, d0)>
-// CHECK-LABEL: func.func @fuse_matmul_transpose
+// CHECK-LABEL: util.func public @fuse_matmul_transpose
 //      CHECK: flow.dispatch.workgroups
 //      CHECK:   %[[MATMUL:.+]] = linalg.matmul
 //      CHECK:   linalg.generic

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_transform_dialect.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_transform_dialect.mlir
index f4e61aa..cc0f8ce 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_transform_dialect.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_transform_dialect.mlir

@@ -1,45 +1,45 @@
-// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-flow-dispatch-with-transform-dialect{transform-file-name=%p/transform_dialect_dispatch_spec.mlir}))" %s | \
+// RUN: iree-opt --pass-pipeline="builtin.module(util.func(iree-flow-dispatch-with-transform-dialect{transform-file-name=%p/transform_dialect_dispatch_spec.mlir}))" %s | \
 // RUN: FileCheck %s
 
-func.func @tile_matmul_alone(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>,
+util.func public @tile_matmul_alone(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>,
                              %arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> {
   %1 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
     outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %1 : tensor<?x?xf32>
+  util.return %1 : tensor<?x?xf32>
 }
-//      CHECK: func.func @tile_matmul_alone
+//      CHECK: util.func public @tile_matmul_alone
 //      CHECK:   flow.dispatch.workgroups
 
-func.func @tile_matmul_with_constant(
+util.func public @tile_matmul_with_constant(
     %arg1 : tensor<5x10xf32>, %arg2 : tensor<10x10xf32>) -> tensor<10x10xf32> {
   // The constant is cloned and fused into the dispatch region.
   %a = arith.constant dense<1.0> : tensor<10x5xf32>
   %1 = linalg.matmul ins(%a, %arg1 : tensor<10x5xf32>, tensor<5x10xf32>)
     outs(%arg2 : tensor<10x10xf32>) -> tensor<10x10xf32>
-  return %1 : tensor<10x10xf32>
+  util.return %1 : tensor<10x10xf32>
 }
-//      CHECK: func.func @tile_matmul_with_constant
+//      CHECK: util.func public @tile_matmul_with_constant
 //      CHECK:   flow.dispatch.workgroups
 //      CHECK:     arith.constant dense<1.000000e+00> : tensor<10x5xf32>
 
 // Some dummy functions to exercise TSAN under parallelism.
-func.func @foo1() -> index {
+util.func public @foo1() -> index {
   %0 = arith.constant 1 : index
-  return %0 : index
+  util.return %0 : index
 }
-func.func @foo2() -> index {
+util.func public @foo2() -> index {
   %0 = arith.constant 2 : index
-  return %0 : index
+  util.return %0 : index
 }
-func.func @foo3() -> index {
+util.func public @foo3() -> index {
   %0 = arith.constant 3 : index
-  return %0 : index
+  util.return %0 : index
 }
-func.func @foo4() -> index {
+util.func public @foo4() -> index {
   %0 = arith.constant 4 : index
-  return %0 : index
+  util.return %0 : index
 }
-func.func @foo5() -> index {
+util.func public @foo5() -> index {
   %0 = arith.constant 5 : index
-  return %0 : index
+  util.return %0 : index
 }

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/export_benchmark_funcs.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/export_benchmark_funcs.mlir
index 959dea7..dfb93d5 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/export_benchmark_funcs.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/export_benchmark_funcs.mlir

@@ -3,29 +3,29 @@
 // Basic usage from the `--iree-native-bindings-support` flag.
 
 // CHECK-LABEL: func private @simpleMul
-func.func @simpleMul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.module.export} {
+util.func public @simpleMul(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.module.export} {
   %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<4xf32>
   %1 = hal.tensor.import %arg1 : !hal.buffer_view -> tensor<4xf32>
   %2 = arith.mulf %0, %1 : tensor<4xf32>
   %3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view
-  return %3 : !hal.buffer_view
+  util.return %3 : !hal.buffer_view
 }
 
 //      CHECK: util.global private @[[GLOBAL_ARG0:.+]] {inlining_policy = #util.inline.never} : !hal.buffer_view
 //      CHECK: util.global private @[[GLOBAL_ARG1:.+]] {inlining_policy = #util.inline.never} : !hal.buffer_view
 
-//      CHECK: func.func @simpleMul_benchmark() attributes {iree.abi.stub, iree.reflection = {iree.benchmark = "entry"}} {
+//      CHECK: util.func public @simpleMul_benchmark() attributes {iree.abi.stub, iree.reflection = {iree.benchmark = "entry"}} {
 //  CHECK-DAG:   %[[ARG0:.+]] = util.global.load @[[GLOBAL_ARG0]] : !hal.buffer_view
 //  CHECK-DAG:   %[[ARG1:.+]] = util.global.load @[[GLOBAL_ARG1]] : !hal.buffer_view
-// CHECK-NEXT:   %[[RET0:.+]] = call @simpleMul(%[[ARG0]], %[[ARG1]])
+// CHECK-NEXT:   %[[RET0:.+]] = util.call @simpleMul(%[[ARG0]], %[[ARG1]])
 //      CHECK:   util.optimization_barrier %[[RET0]] : !hal.buffer_view
-//      CHECK:   return
+//      CHECK:   util.return
 
 // -----
 
 // Ensures that functions with multiple blocks are handled correctly.
 
-func.func @while(%start: i32, %bound: i32) -> i32 {
+util.func public @while(%start: i32, %bound: i32) -> i32 {
   cf.br ^bb1(%start : i32)
 ^bb1(%0: i32):
   %1 = arith.cmpi slt, %0, %bound : i32
@@ -34,29 +34,29 @@
   %4 = arith.addi %3, %3 : i32
   cf.br ^bb1(%4 : i32)
 ^bb3(%5: i32):
-  return %5 : i32
+  util.return %5 : i32
 }
 
 //     CHECK: util.global private @[[GLOBAL_ARG0:.+]] {inlining_policy = #util.inline.never} = 0 : i32
 //     CHECK: util.global private @[[GLOBAL_ARG1:.+]] {inlining_policy = #util.inline.never} = 0 : i32
 
-//     CHECK: func.func @while_benchmark()
+//     CHECK: util.func public @while_benchmark()
 // CHECK-DAG:   %[[ARG0:.+]] = util.global.load @[[GLOBAL_ARG0]] : i32
 // CHECK-DAG:   %[[ARG1:.+]] = util.global.load @[[GLOBAL_ARG1]] : i32
-//     CHECK:   %[[RET0:.+]] = call @while(%[[ARG0]], %[[ARG1]])
+//     CHECK:   %[[RET0:.+]] = util.call @while(%[[ARG0]], %[[ARG1]])
 //     CHECK:   util.optimization_barrier %[[RET0]] : i32
-//     CHECK:   return
+//     CHECK:   util.return
 
 // -----
 
 // Ensure the tensors we allocate are of the desired type after casting.
 
 // CHECK-LABEL: func private @importBufferViewBitcasting
-func.func @importBufferViewBitcasting(%view: !hal.buffer_view) -> !hal.buffer_view {
+util.func public @importBufferViewBitcasting(%view: !hal.buffer_view) -> !hal.buffer_view {
   %0 = hal.tensor.import %view : !hal.buffer_view -> tensor<2xui32> as tensor<4xi32>
   %1 = arith.muli %0, %0 : tensor<4xi32>
   %2 = hal.tensor.export %1 : tensor<4xi32> -> !hal.buffer_view
-  return %2 : !hal.buffer_view
+  util.return %2 : !hal.buffer_view
 }
 
 //      CHECK: util.global private @[[GLOBAL_ARG0:.+]] {inlining_policy = #util.inline.never} : !hal.buffer_view
@@ -66,11 +66,11 @@
 //  CHECK-DAG:   %[[DNO:.+]] = util.optimization_barrier %[[EXPORT]]
 // CHECK-NEXT:   util.global.store %[[DNO]], @[[GLOBAL_ARG0]]
 
-//      CHECK: func.func @importBufferViewBitcasting_benchmark()
+//      CHECK: util.func public @importBufferViewBitcasting_benchmark()
 //  CHECK-DAG:   %[[ARG0:.+]] = util.global.load @[[GLOBAL_ARG0]] : !hal.buffer_view
-// CHECK-NEXT:   %[[RET0:.+]] = call @importBufferViewBitcasting(%[[ARG0]])
+// CHECK-NEXT:   %[[RET0:.+]] = util.call @importBufferViewBitcasting(%[[ARG0]])
 //      CHECK:   util.optimization_barrier %[[RET0]] : !hal.buffer_view
-//      CHECK:   return
+//      CHECK:   util.return
 
 // -----
 
@@ -78,13 +78,13 @@
 // that'll likely cause confusion ((dispatches 0x0x0 work) "whoa so fast!" :).
 
 // expected-error @+1 {{unsupported buffer view import}}
-func.func @importDynamicBufferView(%view: !hal.buffer_view) -> !hal.buffer_view {
+util.func public @importDynamicBufferView(%view: !hal.buffer_view) -> !hal.buffer_view {
   %dim0 = hal.buffer_view.dim<%view : !hal.buffer_view>[0] : index
   %dim1 = hal.buffer_view.dim<%view : !hal.buffer_view>[1] : index
   %0 = hal.tensor.import %view : !hal.buffer_view -> tensor<?x?x4xf32>{%dim0, %dim1}
   %1 = arith.mulf %0, %0 : tensor<?x?x4xf32>
   %2 = hal.tensor.export %1 : tensor<?x?x4xf32>{%dim0, %dim1} -> !hal.buffer_view
-  return %2 : !hal.buffer_view
+  util.return %2 : !hal.buffer_view
 }
 
 // -----
@@ -92,11 +92,11 @@
 // We should look for export ops to find the storage size (must be static).
 
 // CHECK-LABEL: func private @exportBufferViewInPlace
-func.func @exportBufferViewInPlace(%view: !hal.buffer_view, %storage: !hal.buffer) -> !hal.buffer_view {
+util.func public @exportBufferViewInPlace(%view: !hal.buffer_view, %storage: !hal.buffer) -> !hal.buffer_view {
   %0 = hal.tensor.import %view : !hal.buffer_view -> tensor<4xi32>
   %1 = arith.muli %0, %0 : tensor<4xi32>
   %2 = hal.tensor.export %1 into(%storage : !hal.buffer) : tensor<4xi32> -> !hal.buffer_view
-  return %2 : !hal.buffer_view
+  util.return %2 : !hal.buffer_view
 }
 
 //      CHECK: util.global private @[[GLOBAL_ARG0:.+]] {inlining_policy = #util.inline.never} : !hal.buffer_view
@@ -113,9 +113,9 @@
 //  CHECK-DAG:   %[[DNO1:.+]] = util.optimization_barrier %[[EXPORT1]]
 // CHECK-NEXT:   util.global.store %[[DNO1]], @[[GLOBAL_ARG1]]
 
-//      CHECK: func.func @exportBufferViewInPlace_benchmark()
+//      CHECK: util.func public @exportBufferViewInPlace_benchmark()
 //  CHECK-DAG:   %[[ARG0:.+]] = util.global.load @[[GLOBAL_ARG0]] : !hal.buffer_view
 //  CHECK-DAG:   %[[ARG1:.+]] = util.global.load @[[GLOBAL_ARG1]] : !hal.buffer
-// CHECK-NEXT:   %[[RET0:.+]] = call @exportBufferViewInPlace(%[[ARG0]], %[[ARG1]])
+// CHECK-NEXT:   %[[RET0:.+]] = util.call @exportBufferViewInPlace(%[[ARG0]], %[[ARG1]])
 //      CHECK:   util.optimization_barrier %[[RET0]] : !hal.buffer_view
-//      CHECK:   return
+//      CHECK:   util.return

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/fold_unit_dims.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/fold_unit_dims.mlir
index 7878e9b..cc9f684 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/fold_unit_dims.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/fold_unit_dims.mlir

@@ -1,6 +1,6 @@
-// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-flow-fold-unit-extent-dims))" %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline="builtin.module(util.func(iree-flow-fold-unit-extent-dims))" %s | FileCheck %s
 
-func.func @no_fold_unit_dims_in_dispatches(%arg0 : tensor<1x1x10xf32>) -> tensor<1x1x10xf32> {
+util.func public @no_fold_unit_dims_in_dispatches(%arg0 : tensor<1x1x10xf32>) -> tensor<1x1x10xf32> {
   %0 = tensor.empty() : tensor<1x1x10xf32>
   %1 = flow.dispatch.region[] -> (tensor<1x1x10xf32>) {
     %2 = linalg.generic {
@@ -13,11 +13,11 @@
     } -> tensor<1x1x10xf32>
     flow.return %2 : tensor<1x1x10xf32>
   }
-  return %1 : tensor<1x1x10xf32>
+  util.return %1 : tensor<1x1x10xf32>
 }
-//      CHECK: func @no_fold_unit_dims_in_dispatches(%[[ARG0:.+]]: tensor<1x1x10xf32>)
+//      CHECK: util.func public @no_fold_unit_dims_in_dispatches(%[[ARG0:.+]]: tensor<1x1x10xf32>)
 //      CHECK:   %[[DISPATCH:.+]] = flow.dispatch.region
 //      CHECK:     %[[GENERIC:.+]] = linalg.generic
 // CHECK-SAME:         ins(%[[ARG0]] : tensor<1x1x10xf32>)
 //      CHECK:     flow.return %[[GENERIC]]
-//      CHECK:   return %[[DISPATCH]]
+//      CHECK:   util.return %[[DISPATCH]]

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/form_dispatch_regions.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/form_dispatch_regions.mlir
index 537149e..26ddb6d 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/form_dispatch_regions.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/form_dispatch_regions.mlir

@@ -1,6 +1,6 @@
-// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-flow-form-dispatch-regions))" --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline="builtin.module(util.func(iree-flow-form-dispatch-regions))" --split-input-file %s | FileCheck %s
 
-func.func @pack_elementwise_fusion(%arg0 : tensor<?xf32>,
+util.func public @pack_elementwise_fusion(%arg0 : tensor<?xf32>,
     %arg1 : tensor<?x?xf32>) -> tensor<?x?x8x32xf32> {
   %cst = arith.constant 0.0 : f32
   %c0 = arith.constant 0 : index
@@ -27,9 +27,9 @@
   %9 = tensor.pack %5 padding_value(%cst : f32)
       inner_dims_pos = [0, 1] inner_tiles = [8, 32]
       into %8 : tensor<?x?xf32> -> tensor<?x?x8x32xf32>
-  return %9 : tensor<?x?x8x32xf32>
+  util.return %9 : tensor<?x?x8x32xf32>
 }
-// CHECK-LABEL: func @pack_elementwise_fusion(
+// CHECK-LABEL: util.func public @pack_elementwise_fusion(
 //  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<?xf32>
 //  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: tensor<?x?xf32>
 //       CHECK:   %[[RETURN:.+]] = flow.dispatch.region
@@ -38,11 +38,11 @@
 //  CHECK-SAME:         ins(%[[ARG1]], %[[ARG0]] :
 //       CHECK:     %[[PACK:.+]] = tensor.pack %[[GENERIC]]
 //       CHECK:     flow.return %[[PACK]]
-//       CHECK:   return %[[RETURN]]
+//       CHECK:   util.return %[[RETURN]]
 
 // -----
 
-func.func @pack_fusion(%arg0 : tensor<?x?xf32>,
+util.func public @pack_fusion(%arg0 : tensor<?x?xf32>,
     %arg1 : tensor<?x?xf32>) -> tensor<?x?x8x32xf32> {
   %cst = arith.constant 0.0 : f32
   %c0 = arith.constant 0 : index
@@ -80,9 +80,9 @@
   %9 = tensor.pack %5 padding_value(%cst : f32)
       inner_dims_pos = [0, 1] inner_tiles = [8, 32]
       into %8 : tensor<?x?xf32> -> tensor<?x?x8x32xf32>
-  return %9 : tensor<?x?x8x32xf32>
+  util.return %9 : tensor<?x?x8x32xf32>
 }
-// CHECK-LABEL: func @pack_fusion(
+// CHECK-LABEL: util.func public @pack_fusion(
 //  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
 //  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: tensor<?x?xf32>
 //       CHECK:   %[[RETURN:.+]] = flow.dispatch.region
@@ -94,7 +94,7 @@
 //  CHECK-SAME:         ins(%[[ARG1]], %[[REDUCTION]] :
 //       CHECK:     %[[PACK:.+]] = tensor.pack %[[GENERIC]]
 //       CHECK:     flow.return %[[PACK]]
-//       CHECK:   return %[[RETURN]]
+//       CHECK:   util.return %[[RETURN]]
 
 // -----
 
@@ -103,7 +103,7 @@
 #map2 = affine_map<()[s0] -> (s0 ceildiv 8)>
 #map3 = affine_map<()[s0] -> (s0 ceildiv 32)>
 module {
-  func.func @tranpose_pack_fusion(%arg0: tensor<?x?xf32>) -> tensor<?x?x8x32xf32> {
+  util.func public @tranpose_pack_fusion(%arg0: tensor<?x?xf32>) -> tensor<?x?x8x32xf32> {
     %cst = arith.constant 0.000000e+00 : f32
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
@@ -118,12 +118,12 @@
     %3 = affine.apply #map3()[%dim_0]
     %4 = tensor.empty(%2, %3) : tensor<?x?x8x32xf32>
     %pack = tensor.pack %1 padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 32] into %4 : tensor<?x?xf32> -> tensor<?x?x8x32xf32>
-    return %pack : tensor<?x?x8x32xf32>
+    util.return %pack : tensor<?x?x8x32xf32>
   }
 }
 // No fusion as the CPU backend currently can't handle fusion with transpose
 // between ops.
-// CHECK-LABEL: func @tranpose_pack_fusion(
+// CHECK-LABEL: util.func public @tranpose_pack_fusion(
 //  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
 //       CHECK:   %[[DISPATCH1:.+]] = flow.dispatch.region
 //       CHECK:     %[[GENERIC:.+]] = linalg.generic
@@ -132,11 +132,11 @@
 //       CHECK:   %[[DISPATCH2:.+]] = flow.dispatch.region
 //       CHECK:     %[[PACK:.+]] = tensor.pack %[[DISPATCH1]]
 //       CHECK:     flow.return %[[PACK]]
-//       CHECK:   return %[[DISPATCH2]]
+//       CHECK:   util.return %[[DISPATCH2]]
 
 // -----
 
-func.func @set_encoding_fusion(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>,
+util.func public @set_encoding_fusion(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>,
     %arg2 : index, %arg3 : index) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>> {
   %cst = arith.constant 0.0 : f32
   %c0 = arith.constant 0 : index
@@ -168,9 +168,9 @@
   } -> tensor<?x?xf32>
   %6 = iree_linalg_ext.set_encoding %5
       : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>
-  return %6 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>
+  util.return %6 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>
 }
-// CHECK-LABEL: func @set_encoding_fusion(
+// CHECK-LABEL: util.func public @set_encoding_fusion(
 //  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
 //  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: tensor<?x?xf32>
 //       CHECK:   %[[RETURN:.+]] = flow.dispatch.region
@@ -182,11 +182,11 @@
 //  CHECK-SAME:         ins(%[[ARG1]], %[[REDUCTION]] :
 //       CHECK:     %[[PACK:.+]] = iree_linalg_ext.set_encoding %[[GENERIC]]
 //       CHECK:     flow.return %[[PACK]]
-//       CHECK:   return %[[RETURN]]
+//       CHECK:   util.return %[[RETURN]]
 
 // -----
 
-func.func @set_encoding_pad_fusion(%arg0 : tensor<?x?xf32>,
+util.func public @set_encoding_pad_fusion(%arg0 : tensor<?x?xf32>,
     %arg1 : index, %arg2 : index) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>> {
   %cst = arith.constant 0.0 : f32
   %0 = tensor.pad %arg0 low[0, 0] high[%arg1, %arg2] {
@@ -195,19 +195,19 @@
   } : tensor<?x?xf32> to tensor<?x?xf32>
   %1 = iree_linalg_ext.set_encoding %0
       : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>
-  return %1 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>
+  util.return %1 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>
 }
-// CHECK-LABEL: func @set_encoding_pad_fusion(
+// CHECK-LABEL: util.func public @set_encoding_pad_fusion(
 //  CHECK-SAME:     %[[ARG0:.+]]: tensor<?x?xf32>
 //       CHECK:   %[[RETURN:.+]] = flow.dispatch.region
 //       CHECK:     %[[PAD:.+]] = tensor.pad %[[ARG0]]
 //       CHECK:     %[[ENCODING:.+]] = iree_linalg_ext.set_encoding %[[PAD]]
 //       CHECK:     flow.return %[[ENCODING]]
-//       CHECK:   return %[[RETURN]]
+//       CHECK:   util.return %[[RETURN]]
 
 // -----
 
-func.func @set_encoding_pad_elementwise_fusion(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>,
+util.func public @set_encoding_pad_elementwise_fusion(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>,
     %arg2 : index, %arg3 : index) -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>> {
   %cst = arith.constant 0.0 : f32
   %c0 = arith.constant 0 : index
@@ -243,9 +243,9 @@
   } : tensor<?x?xf32> to tensor<?x?xf32>
   %7 = iree_linalg_ext.set_encoding %6
       : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>
-  return %7 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>
+  util.return %7 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>
 }
-// CHECK-LABEL: func @set_encoding_pad_elementwise_fusion(
+// CHECK-LABEL: util.func public @set_encoding_pad_elementwise_fusion(
 //  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?xf32>
 //  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: tensor<?x?xf32>
 //       CHECK:   %[[RETURN:.+]] = flow.dispatch.region
@@ -258,11 +258,11 @@
 //       CHECK:     %[[PAD:.+]] = tensor.pad %[[GENERIC]]
 //       CHECK:     %[[PACK:.+]] = iree_linalg_ext.set_encoding %[[PAD]]
 //       CHECK:     flow.return %[[PACK]]
-//       CHECK:   return %[[RETURN]]
+//       CHECK:   util.return %[[RETURN]]
 
 // -----
 
-func.func @unset_encoding_elementwise_fusion(
+util.func public @unset_encoding_elementwise_fusion(
     %arg0: tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>,
     %arg1: tensor<?xf32>) -> tensor<?x?xf32> {
   %c0 = arith.constant 0 : index
@@ -283,9 +283,9 @@
       %5 = arith.addf %b0, %b1 : f32
       linalg.yield %5 : f32
     } -> tensor<?x?xf32>
-  return %4 : tensor<?x?xf32>
+  util.return %4 : tensor<?x?xf32>
 }
-// CHECK-LABEL: func @unset_encoding_elementwise_fusion(
+// CHECK-LABEL: util.func public @unset_encoding_elementwise_fusion(
 //  CHECK-SAME:     %[[ARG0:.+]]: tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>
 //  CHECK-SAME:     %[[ARG1:.+]]: tensor<?xf32>)
 //       CHECK:   %[[RESULT:.+]] = flow.dispatch.region
@@ -293,11 +293,11 @@
 //       CHECK:     %[[GENERIC:.+]] = linalg.generic
 //  CHECK-SAME:         ins(%[[UNSET_ENCODING]], %[[ARG1]]
 //       CHECK:     flow.return %[[GENERIC]]
-//       CHECK:   return %[[RESULT]]
+//       CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @unset_encoding_slice_elementwise_fusion(
+util.func public @unset_encoding_slice_elementwise_fusion(
     %arg0: tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>,
     %arg1: tensor<?xf32>, %arg2 : index, %arg3 : index) -> tensor<?x?xf32> {
   %c0 = arith.constant 0 : index
@@ -319,9 +319,9 @@
       %6 = arith.addf %b0, %b1 : f32
       linalg.yield %6 : f32
     } -> tensor<?x?xf32>
-  return %5 : tensor<?x?xf32>
+  util.return %5 : tensor<?x?xf32>
 }
-// CHECK-LABEL: func @unset_encoding_slice_elementwise_fusion(
+// CHECK-LABEL: util.func public @unset_encoding_slice_elementwise_fusion(
 //  CHECK-SAME:     %[[ARG0:.+]]: tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>
 //  CHECK-SAME:     %[[ARG1:.+]]: tensor<?xf32>
 //       CHECK:   %[[RESULT0:.+]] = flow.dispatch.region
@@ -329,11 +329,11 @@
 //       CHECK:     %[[SLICE:.+]] = tensor.extract_slice %[[UNSET_ENCODING]]
 //       CHECK:     %[[GENERIC:.+]] = linalg.generic {{.*}} ins(%[[SLICE]]
 //       CHECK:     flow.return %[[GENERIC]]
-//       CHECK:   return %[[RESULT0]]
+//       CHECK:   util.return %[[RESULT0]]
 
 // -----
 
-func.func @unpack_encoding_elementwise_fusion(
+util.func public @unpack_encoding_elementwise_fusion(
     %arg0: tensor<?x?x?x?xf32>,
     %arg1: tensor<?xf32>) -> tensor<?x?xf32> {
   %c0 = arith.constant 0 : index
@@ -360,9 +360,9 @@
       %2 = arith.addf %b0, %b1 : f32
       linalg.yield %2 : f32
     } -> tensor<?x?xf32>
-  return %1 : tensor<?x?xf32>
+  util.return %1 : tensor<?x?xf32>
 }
-// CHECK-LABEL: func @unpack_encoding_elementwise_fusion(
+// CHECK-LABEL: util.func public @unpack_encoding_elementwise_fusion(
 //  CHECK-SAME:     %[[ARG0:.+]]: tensor<?x?x?x?xf32>
 //  CHECK-SAME:     %[[ARG1:.+]]: tensor<?xf32>)
 //       CHECK:   %[[RESULT:.+]] = flow.dispatch.region
@@ -370,11 +370,11 @@
 //       CHECK:     %[[GENERIC:.+]] = linalg.generic
 //  CHECK-SAME:         ins(%[[UNPACK]], %[[ARG1]]
 //       CHECK:     flow.return %[[GENERIC]]
-//       CHECK:   return %[[RESULT]]
+//       CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @unpack_non_intersecting_reduction(
+util.func public @unpack_non_intersecting_reduction(
     %arg0: tensor<?x?x?xf32>,
     %arg1: tensor<?xf32>) -> tensor<?xf32> {
   %c0 = arith.constant 0 : index
@@ -400,9 +400,9 @@
       %3 = arith.addf %2, %b2 : f32
       linalg.yield %3 : f32
     } -> tensor<?xf32>
-  return %1 : tensor<?xf32>
+  util.return %1 : tensor<?xf32>
 }
-// CHECK-LABEL: func @unpack_non_intersecting_reduction(
+// CHECK-LABEL: util.func public @unpack_non_intersecting_reduction(
 //  CHECK-SAME:     %[[ARG0:.+]]: tensor<?x?x?xf32>
 //  CHECK-SAME:     %[[ARG1:.+]]: tensor<?xf32>)
 //       CHECK:   %[[RESULT:.+]] = flow.dispatch.region
@@ -410,11 +410,11 @@
 //       CHECK:     %[[GENERIC:.+]] = linalg.generic
 //  CHECK-SAME:         ins(%[[UNPACK]], %[[ARG1]]
 //       CHECK:     flow.return %[[GENERIC]]
-//       CHECK:   return %[[RESULT]]
+//       CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @data_dependent_shape(%arg0 : tensor<f32>, %arg1 : tensor<2xi32>)
+util.func public @data_dependent_shape(%arg0 : tensor<f32>, %arg1 : tensor<2xi32>)
     -> tensor<?x?xf32> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
@@ -430,9 +430,9 @@
     ^bb0(%b0: f32, %b1 : f32):
       linalg.yield %b0 : f32
     } -> tensor<?x?xf32>
-  return %generic : tensor<?x?xf32>
+  util.return %generic : tensor<?x?xf32>
 }
-//      CHECK: func @data_dependent_shape(
+//      CHECK: util.func public @data_dependent_shape(
 // CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<f32>
 // CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: tensor<2xi32>
 //  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
@@ -452,7 +452,7 @@
 
 // -----
 
-func.func @no_yield_dead_results(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?xf32>, %arg2 : tensor<?xf32>) -> tensor<?xf32> {
+util.func public @no_yield_dead_results(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?xf32>, %arg2 : tensor<?xf32>) -> tensor<?xf32> {
   %0:2 = linalg.generic {
       indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>],
       iterator_types = ["parallel", "reduction"]}
@@ -462,17 +462,17 @@
       %2 = arith.addf %b0, %b2 : f32
       linalg.yield %1, %2 : f32, f32
     } -> (tensor<?xf32>, tensor<?xf32>)
-  return %0#1 : tensor<?xf32>
+  util.return %0#1 : tensor<?xf32>
 }
-// CHECK: func @no_yield_dead_results
+// CHECK: util.func public @no_yield_dead_results
 // CHECK:   %[[RESULT:.+]] = flow.dispatch.region
 // CHECK:     %[[GENERIC:.+]]:2 = linalg.generic
 // CHECK:     flow.return %[[GENERIC]]#1
-// CHECK:   return %[[RESULT]]
+// CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @scf_nested_dispatch(%arg0 : tensor<?xi32>) -> (tensor<?xi32>) {
+util.func public @scf_nested_dispatch(%arg0 : tensor<?xi32>) -> (tensor<?xi32>) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %dim = tensor.dim %arg0, %c0 : tensor<?xi32>
@@ -489,7 +489,7 @@
     scf.yield %arg0 : tensor<?xi32>
   }
 
-  return %scf : tensor<?xi32>
+  util.return %scf : tensor<?xi32>
 }
 
 // CHECK-LABEL: @scf_nested_dispatch
@@ -501,7 +501,7 @@
 
 // -----
 
-func.func @no_dequantization_fusion(%arg0: tensor<4096x32x128xi8>, %arg1: tensor<1x1x32x128xf32>, %arg2: tensor<4096x32x1xf32>, %arg3: tensor<4096x32x1xf32>) -> tensor<1x1x4096xf32> {
+util.func public @no_dequantization_fusion(%arg0: tensor<4096x32x128xi8>, %arg1: tensor<1x1x32x128xf32>, %arg2: tensor<4096x32x1xf32>, %arg3: tensor<4096x32x1xf32>) -> tensor<1x1x4096xf32> {
   %cst = arith.constant 0.000000e+00 : f32
   %0 = tensor.empty() : tensor<1x1x4096xf32>
   %1 = tensor.empty() : tensor<4096x32x128xf32>
@@ -531,9 +531,9 @@
     %6 = arith.addf %5, %out : f32
     linalg.yield %6 : f32
   } -> tensor<1x1x4096xf32>
-  return %4 : tensor<1x1x4096xf32>
+  util.return %4 : tensor<1x1x4096xf32>
 }
-//       CHECK: func.func @no_dequantization_fusion
+//       CHECK: util.func public @no_dequantization_fusion
 //  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: tensor<4096x32x128xi8>
 //  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: tensor<1x1x32x128xf32>
 //  CHECK-SAME:   %[[ARG2:[a-zA-Z0-9_]+]]: tensor<4096x32x1xf32>
@@ -553,25 +553,25 @@
 //  CHECK-SAME:       ins(%[[ARG1]], %[[GEN0]] :
 //  CHECK-SAME:       outs(%[[FILL]] :
 //       CHECK:   flow.return %[[GEN1]] :
-//       CHECK:   return %[[DISP]]
+//       CHECK:   util.return %[[DISP]]
 
 // -----
 
 #map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>
 module {
-  func.func @no_dequantization_like_fusion(%arg0: tensor<32x1x16x1x8xi16>, %arg1: tensor<32x344x16x32x8xi4>) -> tensor<32x1x344x1x32xi32> {
+  util.func public @no_dequantization_like_fusion(%arg0: tensor<32x1x16x1x8xi16>, %arg1: tensor<32x344x16x32x8xi4>) -> tensor<32x1x344x1x32xi32> {
     %c0_i32 = arith.constant 0 : i32
     %0 = tensor.empty() : tensor<32x1x16x1x8xi32>
-    %1 = linalg.generic {indexing_maps = [#map, #map], 
-                         iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} 
+    %1 = linalg.generic {indexing_maps = [#map, #map],
+                         iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]}
                          ins(%arg0 : tensor<32x1x16x1x8xi16>) outs(%0 : tensor<32x1x16x1x8xi32>) {
     ^bb0(%in: i16, %out: i32):
       %7 = arith.extsi %in : i16 to i32
       linalg.yield %7 : i32
     } -> tensor<32x1x16x1x8xi32>
     %2 = tensor.empty() : tensor<32x344x16x32x8xi32>
-    %3 = linalg.generic {indexing_maps = [#map, #map], 
-                         iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} 
+    %3 = linalg.generic {indexing_maps = [#map, #map],
+                         iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]}
                          ins(%arg1 : tensor<32x344x16x32x8xi4>) outs(%2 : tensor<32x344x16x32x8xi32>) {
     ^bb0(%in: i4, %out: i32):
       %7 = arith.extui %in : i4 to i32
@@ -580,10 +580,10 @@
     %4 = tensor.empty() : tensor<32x1x344x1x32xi32>
     %5 = linalg.fill ins(%c0_i32 : i32) outs(%4 : tensor<32x1x344x1x32xi32>) -> tensor<32x1x344x1x32xi32>
     %7 = linalg.batch_mmt4d ins(%1, %3 : tensor<32x1x16x1x8xi32>, tensor<32x344x16x32x8xi32>) outs(%5 : tensor<32x1x344x1x32xi32>) -> tensor<32x1x344x1x32xi32>
-    return %7 : tensor<32x1x344x1x32xi32>
+    util.return %7 : tensor<32x1x344x1x32xi32>
   }
 }
-//       CHECK: func.func @no_dequantization_like_fusion
+//       CHECK: util.func public @no_dequantization_like_fusion
 //  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: tensor<32x1x16x1x8xi16>
 //  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: tensor<32x344x16x32x8xi4>
 //   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : i32
@@ -605,4 +605,4 @@
 //  CHECK-SAME:       ins(%[[GEN0]], %[[GEN1]] :
 //  CHECK-SAME:       outs(%[[FILL]] :
 //       CHECK:   flow.return %[[MMT4D]] :
-//       CHECK:   return %[[DISP]]
+//       CHECK:   util.return %[[DISP]]

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/form_dispatch_workgroups.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/form_dispatch_workgroups.mlir
index adabc66..98b67e5 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/form_dispatch_workgroups.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/form_dispatch_workgroups.mlir

@@ -1,6 +1,6 @@
-// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-flow-form-dispatch-workgroups))" --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline="builtin.module(util.func(iree-flow-form-dispatch-workgroups))" --split-input-file %s | FileCheck %s
 
-func.func @existing_count_region(%arg0 : index, %arg1 : index) -> tensor<?x?xf32> {
+util.func public @existing_count_region(%arg0 : index, %arg1 : index) -> tensor<?x?xf32> {
   %c1 = arith.constant 1 : index
   %0 = flow.dispatch.region[%arg0, %arg1] -> (tensor<?x?xf32>{%arg0, %arg1}) {
     %1 = tensor.empty(%arg0, %arg1) : tensor<?x?xf32>
@@ -8,16 +8,16 @@
   } count(%arg2 : index, %arg3 : index) -> (index, index, index) {
     flow.return %arg2, %arg3, %c1 : index, index, index
   }
-  return %0 : tensor<?x?xf32>
+  util.return %0 : tensor<?x?xf32>
 }
-// CHECK-LABEL: func @existing_count_region(
+// CHECK-LABEL: util.func public @existing_count_region(
 //       CHECK:   count(%[[ARG2:[a-zA-Z0-9]+]]: index, %[[ARG3:[a-zA-Z0-9]+]]: index)
 //       CHECK:     %[[C1:.+]] = arith.constant 1 : index
 //       CHECK:     flow.return %[[ARG2]], %[[ARG3]], %[[C1]]
 
 // -----
 
-func.func @simple_test_with_cfg(%arg0: i1) -> (tensor<10x20xf32>) {
+util.func public @simple_test_with_cfg(%arg0: i1) -> (tensor<10x20xf32>) {
   %cst = arith.constant dense<1.000000e+00> : tensor<10x20xf32>
   %0 = flow.dispatch.region -> (tensor<10x20xf32>) {
     %cst_0 = arith.constant dense<1.000000e+00> : tensor<10x20xf32>
@@ -28,9 +28,9 @@
   ^bb2:  // pred: ^bb0
     flow.return %cst_0 : tensor<10x20xf32>
   }
-  return %0 : tensor<10x20xf32>
+  util.return %0 : tensor<10x20xf32>
 }
-// CHECK-LABEL: func @simple_test_with_cfg
+// CHECK-LABEL: util.func public @simple_test_with_cfg
 //  CHECK-SAME:     %[[ARG0:.+]]: i1
 //       CHECK:   %[[RESULT:.+]] = flow.dispatch.workgroups(%[[ARG0]])
 //  CHECK-NEXT:       %[[ARG1:.+]]: i1, %[[ARG2:.+]]: !flow.dispatch.tensor
@@ -42,4 +42,4 @@
 //       CHECK:     ^[[BB2:.+]]:
 //       CHECK:       flow.dispatch.tensor.store %[[CST]], %[[ARG2]]
 //       CHECK:       flow.return
-//       CHECK:   return %[[RESULT]]
+//       CHECK:   util.return %[[RESULT]]

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/form_scalar_dispatches.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/form_scalar_dispatches.mlir
index b918a79..c54d8f7 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/form_scalar_dispatches.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/form_scalar_dispatches.mlir

@@ -1,7 +1,7 @@
-// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-flow-form-scalar-dispatches))" --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline="builtin.module(util.func(iree-flow-form-scalar-dispatches))" --split-input-file %s | FileCheck %s
 
 #map = affine_map<() -> ()>
-func.func @simpleDAG(
+util.func public @simpleDAG(
     %arg0 : tensor<f32>, %arg1 : tensor<f32>, %arg2 : tensor<f32>, %arg3 : tensor<f32>)
     -> (tensor<f32>, tensor<f32>) {
   %0 = tensor.empty() : tensor<f32>
@@ -23,9 +23,9 @@
       %6 = arith.subf %b1, %b0 : f32
       linalg.yield %6 : f32
     } -> tensor<f32>
-  return %1, %5 : tensor<f32>, tensor<f32>
+  util.return %1, %5 : tensor<f32>, tensor<f32>
 }
-// CHECK-LABEL: func @simpleDAG(
+// CHECK-LABEL: util.func public @simpleDAG(
 //  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<f32>
 //  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: tensor<f32>
 //  CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: tensor<f32>
@@ -41,12 +41,12 @@
 //       CHECK:     count() -> (index, index, index)
 //  CHECK-NEXT:       %[[C1:.+]] = arith.constant 1 : index
 //  CHECK-NEXT:       flow.return %[[C1]], %[[C1]], %[[C1]]
-//       CHECK:   return %[[RESULT]]#1, %[[RESULT]]#0
+//       CHECK:   util.return %[[RESULT]]#1, %[[RESULT]]#0
 
 // -----
 
 #map = affine_map<() -> ()>
-func.func @simpleHorizontal(
+util.func public @simpleHorizontal(
     %arg0 : tensor<f32>, %arg1 : tensor<f32>, %arg2 : tensor<f32>, %arg3 : tensor<f32>)
     -> (tensor<f32>, tensor<f32>) {
   %0 = tensor.empty() : tensor<f32>
@@ -68,9 +68,9 @@
       %6 = arith.addf %b0, %b0 : f32
       linalg.yield %6 : f32
     } -> tensor<f32>
-  return %3, %5 : tensor<f32>, tensor<f32>
+  util.return %3, %5 : tensor<f32>, tensor<f32>
 }
-// CHECK-LABEL: func @simpleHorizontal
+// CHECK-LABEL: util.func public @simpleHorizontal
 //  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<f32>
 //  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: tensor<f32>
 //  CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: tensor<f32>
@@ -86,7 +86,7 @@
 //       CHECK:     count() -> (index, index, index)
 //  CHECK-NEXT:       %[[C1:.+]] = arith.constant 1 : index
 //  CHECK-NEXT:       flow.return %[[C1]], %[[C1]], %[[C1]]
-//       CHECK:   return %[[RESULT]]#1, %[[RESULT]]#0
+//       CHECK:   util.return %[[RESULT]]#1, %[[RESULT]]#0
 
 // -----
 
@@ -94,7 +94,7 @@
 #map1 = affine_map<(d0, d1) -> (d0, d1)>
 #map2 = affine_map<(d0, d1) -> (d0)>
 #map3 = affine_map<(d0) -> (d0)>
-func.func @interleaving(
+util.func public @interleaving(
       %arg0 : tensor<1x1xf32>, %arg1 : tensor<1xf32>, %arg2 : tensor<f32>, %arg3 : tensor<f32>)
       -> (tensor<f32>, tensor<1xf32>) {
     %cst = arith.constant 0.0 : f32
@@ -128,9 +128,9 @@
         %10 = arith.divf %b1, %b0 : f32
         linalg.yield %10 : f32
       } -> tensor<f32>
-    return %9, %7 : tensor<f32>, tensor<1xf32>
+    util.return %9, %7 : tensor<f32>, tensor<1xf32>
 }
-// CHECK-LABEL: func @interleaving(
+// CHECK-LABEL: util.func public @interleaving(
 //  CHECK-SAME:     %[[ARG0:.+]]: tensor<1x1xf32>,
 //  CHECK-SAME:     %[[ARG1:.+]]: tensor<1xf32>,
 //  CHECK-SAME:     %[[ARG2:.+]]: tensor<f32>,
@@ -156,4 +156,4 @@
 //  CHECK-SAME:         ins(%[[DISPATCH0]]#0, %[[ARG3]] :
 //  CHECK-SAME:         outs(%[[EMPTY1]] :
 //       CHECK:    flow.return %[[GENERIC3]], %[[GENERIC2]]
-//       CHECK:  return %[[DISPATCH1]]#0, %[[DISPATCH1]]#1
+//       CHECK:  util.return %[[DISPATCH1]]#0, %[[DISPATCH1]]#1

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/fusion_of_tensor_ops.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/fusion_of_tensor_ops.mlir
index 848f241..f3741b6 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/fusion_of_tensor_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/fusion_of_tensor_ops.mlir

@@ -1,6 +1,6 @@
-// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-flow-fusion-of-tensor-ops{fuse-multi-use=true}))" %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(util.func(iree-flow-fusion-of-tensor-ops{fuse-multi-use=true}))" %s | FileCheck %s
 
-func.func @softmax(%arg0 : tensor<12x128x128xf32>) -> tensor<12x128x128xf32> {
+util.func public @softmax(%arg0 : tensor<12x128x128xf32>) -> tensor<12x128x128xf32> {
   %cst = arith.constant 1.000000e+00 : f32
   %cst_0 = arith.constant 0.000000e+00 : f32
   %cst_1 = arith.constant -3.40282347E+38 : f32
@@ -38,9 +38,9 @@
     %11 = arith.mulf %b0, %b1 : f32
     linalg.yield %11 : f32
   } -> tensor<12x128x128xf32>
-  return %10 : tensor<12x128x128xf32>
+  util.return %10 : tensor<12x128x128xf32>
 }
-// CHECK-LABEL: func.func @softmax
+// CHECK-LABEL: util.func public @softmax
 //  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<12x128x128xf32>
 //       CHECK:   %[[INIT0:.+]] = tensor.empty()
 //       CHECK:   %[[FILL0:.+]] = linalg.fill
@@ -63,11 +63,11 @@
 //       CHECK:   %[[GENERIC3:.+]] = linalg.generic
 //  CHECK-SAME:       ins(%[[GENERIC1]], %[[GENERIC2]] :
 //  CHECK-SAME:       outs(%[[INIT1]] :
-//       CHECK:   return %[[GENERIC3]]
+//       CHECK:   util.return %[[GENERIC3]]
 
 // -----
 
-func.func @batchnorm_training(%10 : tensor<12xf32>, %11 : tensor<12x12x12x12x12xf32>, %12 : tensor<12xf32>) -> (tensor<12xf32>, tensor<12xf32>, tensor<12xf32>)
+util.func public @batchnorm_training(%10 : tensor<12xf32>, %11 : tensor<12x12x12x12x12xf32>, %12 : tensor<12xf32>) -> (tensor<12xf32>, tensor<12xf32>, tensor<12xf32>)
 {
   %cst = arith.constant 1.42 : f32
   %cst_1 = arith.constant 1.45 : f32
@@ -111,9 +111,9 @@
       %21 = arith.subf %arg1, %20 : f32
       linalg.yield %21 : f32
     } -> tensor<12xf32>
-  return %16, %17, %18 : tensor<12xf32>, tensor<12xf32>, tensor<12xf32>
+  util.return %16, %17, %18 : tensor<12xf32>, tensor<12xf32>, tensor<12xf32>
 }
-// CHECK-LABEL: func @batchnorm_training(
+// CHECK-LABEL: util.func public @batchnorm_training(
 //  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<12xf32>
 //  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: tensor<12x12x12x12x12xf32>
 //  CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: tensor<12xf32>
@@ -126,13 +126,13 @@
 //       CHECK:   %[[GENERIC1:.+]]:3 = linalg.generic
 //  CHECK-SAME:       ins(%[[ARG0]], %[[GENERIC0]] :
 //  CHECK-SAME:       outs(%[[INIT]], %[[INIT]], %[[INIT]] :
-//       CHECK:   return %[[GENERIC1]]#0, %[[GENERIC1]]#1, %[[GENERIC1]]#2
+//       CHECK:   util.return %[[GENERIC1]]#0, %[[GENERIC1]]#1, %[[GENERIC1]]#2
 
 // -----
 
 #map = affine_map<(d0, d1) -> (d0, d1)>
 module {
-  func.func @fuse_only_with_same_marker(%arg0: tensor<5x5xf32>, %arg1: tensor<5x5xf32>) -> (tensor<5x5xf32>, tensor<5x5xf32>, tensor<5x5xf32>, tensor<5x5xf32>) {
+  util.func public @fuse_only_with_same_marker(%arg0: tensor<5x5xf32>, %arg1: tensor<5x5xf32>) -> (tensor<5x5xf32>, tensor<5x5xf32>, tensor<5x5xf32>, tensor<5x5xf32>) {
     %cst = arith.constant 1.000000e+00 : f32
     %cst_0 = arith.constant 2.000000e+00 : f32
     %cst_1 = arith.constant 3.000000e+00 : f32
@@ -160,10 +160,10 @@
       %8 = arith.subf %arg2, %arg3 : f32
       linalg.yield %8 : f32
     } -> tensor<5x5xf32>
-    return %4, %5, %6, %7 : tensor<5x5xf32>, tensor<5x5xf32>, tensor<5x5xf32>, tensor<5x5xf32>
+    util.return %4, %5, %6, %7 : tensor<5x5xf32>, tensor<5x5xf32>, tensor<5x5xf32>, tensor<5x5xf32>
   }
 }
-// CHECK-LABEL: func.func @fuse_only_with_same_marke
+// CHECK-LABEL: util.func public @fuse_only_with_same_marke
 // CHECK:         linalg.generic
 // CHECK-NOT:     linalg.generic
 
@@ -175,7 +175,7 @@
 #map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d3, d4, d5)>
 #map3 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2)>
 module {
-  func.func @fuse_only_projected_perm(%arg0: tensor<16x1082x1922xi8>, %arg1: tensor<32x16x3x3xf32>, %arg2: tensor<32x1080x1920xi32>) -> tensor<32x1080x1920xi32> {
+  util.func public @fuse_only_projected_perm(%arg0: tensor<16x1082x1922xi8>, %arg1: tensor<32x16x3x3xf32>, %arg2: tensor<32x1080x1920xi32>) -> tensor<32x1080x1920xi32> {
     %0 = tensor.empty() : tensor<32x16x3x3xi8>
     %eltwise = linalg.generic {
              indexing_maps = [#map0, #map0],
@@ -200,10 +200,10 @@
       linalg.yield %235 : i32
     } -> tensor<32x1080x1920xi32>
 
-    return %conv : tensor<32x1080x1920xi32>
+    util.return %conv : tensor<32x1080x1920xi32>
   }
 }
-// CHECK-LABEL: func.func @fuse_only_projected_perm
+// CHECK-LABEL: util.func public @fuse_only_projected_perm
 // CHECK:         linalg.generic
 // CHECK:         linalg.generic
 
@@ -214,7 +214,7 @@
 #map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3, d0)>
 #map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
 module {
-  func.func @nofuse_broadcast_compute(%arg0: tensor<702x702x128xf32>, %arg1: tensor<702x702x128xf32>,
+  util.func public @nofuse_broadcast_compute(%arg0: tensor<702x702x128xf32>, %arg1: tensor<702x702x128xf32>,
       %arg2: tensor<702x702x128xf32>, %arg3: tensor<702x702x128xf32>) -> tensor<128x702x702xf32> {
     %cst = arith.constant dense<1.000000e+00> : tensor<702x702x128xf32>
     %cst_0 = arith.constant 0.000000e+00 : f32
@@ -252,10 +252,10 @@
       %10 = arith.addf %out, %9 : f32
       linalg.yield %10 : f32
     } -> tensor<128x702x702xf32>
-    return %8 : tensor<128x702x702xf32>
+    util.return %8 : tensor<128x702x702xf32>
   }
 }
-// CHECK-LABEL: func @nofuse_broadcast_compute(
+// CHECK-LABEL: util.func public @nofuse_broadcast_compute(
 //  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<702x702x128xf32>
 //  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: tensor<702x702x128xf32>
 //  CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: tensor<702x702x128xf32>
@@ -273,11 +273,11 @@
 //       CHECK:   %[[GENERIC2:.+]] = linalg.generic
 //  CHECK-SAME:       ins(%[[GENERIC1]], %[[GENERIC0]] :
 //  CHECK-SAME:       outs(%[[FILL]] :
-//       CHECK:   return %[[GENERIC2]]
+//       CHECK:   util.return %[[GENERIC2]]
 
 // -----
 
-func.func @fuse_iota_ops(%arg0: tensor<10x20xi32>) -> (tensor<10x20xi32>, tensor<10x20xi32>) {
+util.func public @fuse_iota_ops(%arg0: tensor<10x20xi32>) -> (tensor<10x20xi32>, tensor<10x20xi32>) {
   %c20 = arith.constant 20 : index
   %0 = tensor.empty() : tensor<10x20xi32>
   %1 = tensor.empty() : tensor<10x20xindex>
@@ -310,9 +310,9 @@
       %9 = arith.muli %8, %b0 : i32
       linalg.yield %9 : i32
     } -> tensor<10x20xi32>
-    return %7, %8 : tensor<10x20xi32>, tensor<10x20xi32>
+    util.return %7, %8 : tensor<10x20xi32>, tensor<10x20xi32>
 }
-// CHECK-LABEL: func @fuse_iota_ops(
+// CHECK-LABEL: util.func public @fuse_iota_ops(
 //  CHECK-SAME:     %[[ARG0:.+]]: tensor<10x20xi32>)
 //       CHECK:   %[[EMPTY:.+]] = tensor.empty() : tensor<10x20xi32>
 //       CHECK:   %[[GENERIC1:.+]] = linalg.generic
@@ -329,11 +329,11 @@
 //       CHECK:     linalg.index
 //       CHECK:     arith.muli
 //       CHECK:     linalg.yield
-//       CHECK:   return %[[GENERIC1]], %[[GENERIC2]]
+//       CHECK:   util.return %[[GENERIC1]], %[[GENERIC2]]
 
 // -----
 
-func.func @no_fuse_within_dispatch(%arg0 : tensor<10x20xf32>) -> tensor<10x20xf32> {
+util.func public @no_fuse_within_dispatch(%arg0 : tensor<10x20xf32>) -> tensor<10x20xf32> {
   %0 = flow.dispatch.region[] -> (tensor<10x20xf32>) {
     %1 = tensor.empty() : tensor<10x20xf32>
     %2 = linalg.generic {
@@ -355,18 +355,18 @@
     } -> tensor<10x20xf32>
     flow.return %3 : tensor<10x20xf32>
   }
-  return %0 : tensor<10x20xf32>
+  util.return %0 : tensor<10x20xf32>
 }
-// CHECK-LABEL: func @no_fuse_within_dispatch
+// CHECK-LABEL: util.func public @no_fuse_within_dispatch
 //       CHECK:   %[[RETURN:.+]] = flow.dispatch.region
 //       CHECK:     linalg.generic
 //       CHECK:     %[[GENERIC:.+]] = linalg.generic
 //       CHECK:     flow.return %[[GENERIC]]
-//       CHECK:   return %[[RETURN]]
+//       CHECK:   util.return %[[RETURN]]
 
 // -----
 
-func.func @nofuse_by_expand_dequant(%arg0 : tensor<11008x4096xi4>, %arg1 : tensor<11008x32x1xf16>, %arg2 : tensor<11008x32x1xf16>) -> (tensor<11008xf16>) {
+util.func public @nofuse_by_expand_dequant(%arg0 : tensor<11008x4096xi4>, %arg1 : tensor<11008x32x1xf16>, %arg2 : tensor<11008x32x1xf16>) -> (tensor<11008xf16>) {
   %cst_1 = arith.constant 0.000000e+00 : f16
   %0 = tensor.empty() : tensor<11008x32x128xf16>
   %1 = arith.constant dense<0.000000e+00> : tensor<1x1x32x128xf16>
@@ -390,9 +390,9 @@
     %13 = arith.addf %12, %out : f16
     linalg.yield %13 : f16
   } -> tensor<11008xf16>
-  return %5 : tensor<11008xf16>
+  util.return %5 : tensor<11008xf16>
 }
-//     CHECK-LABEL: func.func @nofuse_by_expand_dequant
+//     CHECK-LABEL: util.func public @nofuse_by_expand_dequant
 //   CHECK-COUNT-2:   tensor.collapse_shape
 //           CHECK:     %[[DEQUANT:.+]] = linalg.generic
 //      CHECK-SAME:         iterator_types = ["parallel", "parallel", "parallel"]
@@ -410,7 +410,7 @@
 #map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4)>
 #map5 = affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d4)>
 #map6 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>
-func.func @nofuse_by_collapse_matmul(%arg0: tensor<1x1xi64>, %arg1: tensor<4096x32x128xi4>, %arg2: tensor<4096x32x1xf16>, %arg3: tensor<4096x32x1xf16>) -> tensor<1x1x4096xf16> {
+util.func public @nofuse_by_collapse_matmul(%arg0: tensor<1x1xi64>, %arg1: tensor<4096x32x128xi4>, %arg2: tensor<4096x32x1xf16>, %arg3: tensor<4096x32x1xf16>) -> tensor<1x1x4096xf16> {
   %cst = arith.constant 0.000000e+00 : f16
   %c32000 = arith.constant 32000 : index
   %c0_i64 = arith.constant 0 : i64
@@ -456,14 +456,14 @@
     %12 = arith.addf %11, %out : f16
     linalg.yield %12 : f16
   } -> tensor<1x1x4096xf16>
-  return %10 : tensor<1x1x4096xf16>
+  util.return %10 : tensor<1x1x4096xf16>
 }
 //  CHECK-DAG: #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, 0)>
 //  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4)>
 //  CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d4)>
 //  CHECK-DAG: #[[MAP4:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>
-//      CHECK: func.func @nofuse_by_collapse_matmul
+//      CHECK: util.func public @nofuse_by_collapse_matmul
 //      CHECK:   %[[DEQUANT:.+]] = linalg.generic {indexing_maps = [#[[MAP]], #[[MAP1]], #[[MAP1]],  #[[MAP]]],
 // CHECK-SAME:     iterator_types = ["parallel", "parallel", "parallel"]
 //  CHECK-NOT:   tensor.collapse_shape %[[DEQUANT]]
@@ -473,7 +473,7 @@
 //  CHECK-NOT:   tensor.expand_shape %[[MATVEC]]
 // -----
 
-func.func @math_sin() {
+util.func public @math_sin() {
   %cst = arith.constant 2.000000e+00 : f32
   %cst_0 = arith.constant dense<[0.000000e+00, 6.349640e-01, -6.349640e-01, 6.349640e-01]> : tensor<4xf32>
   %cst_1 = arith.constant dense<[0.000000e+00, 1.298460e+00, 1.298460e+00, -1.298460e+00]> : tensor<4xf32>
@@ -507,9 +507,9 @@
   } -> tensor<4xf32>
   check.expect_almost_eq(%4#1, %cst_1) : tensor<4xf32>
   check.expect_almost_eq(%5, %cst_0) : tensor<4xf32>
-  return
+  util.return
 }
-// CHECK-LABEL: func @math_sin()
+// CHECK-LABEL: util.func public @math_sin()
 //       CHECK:   %[[GENERIC:.+]]:2 = linalg.generic
 //   CHECK-DAG:   check.expect_almost_eq(%[[GENERIC]]#0,
 //   CHECK-DAG:   check.expect_almost_eq(%[[GENERIC]]#1,
@@ -517,7 +517,7 @@
 // -----
 
 // Check for fix for https://github.com/openxla/iree/issues/14953
-func.func @fix_issue_14953(%arg0: tensor<11008x32x1xf16>, %arg1: tensor<11008x32x1xf16>, %arg2: tensor<1x1x32x128xf16>) -> tensor<1x1x11008xf16> {
+util.func public @fix_issue_14953(%arg0: tensor<11008x32x1xf16>, %arg1: tensor<11008x32x1xf16>, %arg2: tensor<1x1x32x128xf16>) -> tensor<1x1x11008xf16> {
   %cst = arith.constant 0.000000e+00 : f16
   %cst_0 = arith.constant dense<0> : tensor<11008x32x128xi4>
   %3 = util.optimization_barrier %cst_0 : tensor<11008x32x128xi4>
@@ -545,9 +545,9 @@
     flow.return %10 : tensor<11008xf16>
   }
   %expanded = tensor.expand_shape %7 [[0, 1, 2]] : tensor<11008xf16> into tensor<1x1x11008xf16>
-  return %expanded : tensor<1x1x11008xf16>
+  util.return %expanded : tensor<1x1x11008xf16>
 }
-// CHECK-LABEL: func @fix_issue_14953
+// CHECK-LABEL: util.func public @fix_issue_14953
 //       CHECK:   flow.dispatch.region
 //       CHECK:     %[[GENERIC0:.+]] = linalg.generic
 //  CHECK-SAME:         iterator_types = ["parallel", "parallel", "parallel"]

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/initialize_empty_tensors.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/initialize_empty_tensors.mlir
index 1b9a1f5..1050f15 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/initialize_empty_tensors.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/initialize_empty_tensors.mlir

@@ -1,31 +1,31 @@
-// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-flow-initialize-empty-tensors{zero-fill=true}))' --split-input-file %s | FileCheck %s --check-prefix=ZERO-CHECK
-// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-flow-initialize-empty-tensors{zero-fill=false}))' --split-input-file %s | FileCheck %s --check-prefix=EMPTY-CHECK
+// RUN: iree-opt --pass-pipeline='builtin.module(util.func(iree-flow-initialize-empty-tensors{zero-fill=true}))' --split-input-file %s | FileCheck %s --check-prefix=ZERO-CHECK
+// RUN: iree-opt --pass-pipeline='builtin.module(util.func(iree-flow-initialize-empty-tensors{zero-fill=false}))' --split-input-file %s | FileCheck %s --check-prefix=EMPTY-CHECK
 
-func.func @return_zero_init(%arg0 : index, %arg1 : index) -> (tensor<?x?x42xi32>, tensor<?x42x?xf32>) {
+util.func public @return_zero_init(%arg0 : index, %arg1 : index) -> (tensor<?x?x42xi32>, tensor<?x42x?xf32>) {
   %0 = tensor.empty(%arg0, %arg1) : tensor<?x?x42xi32>
   %1 = tensor.empty(%arg1, %arg0) : tensor<?x42x?xf32>
-  return %0, %1 : tensor<?x?x42xi32>, tensor<?x42x?xf32>
+  util.return %0, %1 : tensor<?x?x42xi32>, tensor<?x42x?xf32>
 }
 
-//      ZERO-CHECK: func.func @return_zero_init(
+//      ZERO-CHECK: util.func public @return_zero_init(
 // ZERO-CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index
 // ZERO-CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index
 //  ZERO-CHECK-DAG:   %[[ZERO_INT:.+]] = arith.constant 0 : i32
 //  ZERO-CHECK-DAG:   %[[ZERO_FLOAT:.+]] = arith.constant 0.000000e+00 : f32
 //  ZERO-CHECK-DAG:   %[[SPLAT_INT:.+]] = flow.tensor.splat %[[ZERO_INT]] : tensor<?x?x42xi32>{%[[ARG0]], %[[ARG1]]}
 //  ZERO-CHECK-DAG:   %[[SPLAT_FLOAT:.+]] = flow.tensor.splat %[[ZERO_FLOAT]] : tensor<?x42x?xf32>{%[[ARG1]], %[[ARG0]]}
-//      ZERO-CHECK:   return %[[SPLAT_INT]], %[[SPLAT_FLOAT]]
+//      ZERO-CHECK:   util.return %[[SPLAT_INT]], %[[SPLAT_FLOAT]]
 
-//      EMPTY-CHECK: func.func @return_zero_init(
+//      EMPTY-CHECK: util.func public @return_zero_init(
 // EMPTY-CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: index
 // EMPTY-CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: index
 //  EMPTY-CHECK-DAG:   %[[EMPTY_INT:.+]] = flow.tensor.empty : tensor<?x?x42xi32>{%[[ARG0]], %[[ARG1]]}
 //  EMPTY-CHECK-DAG:   %[[EMPTY_FLOAT:.+]] = flow.tensor.empty : tensor<?x42x?xf32>{%[[ARG1]], %[[ARG0]]}
-//      EMPTY-CHECK:   return %[[EMPTY_INT]], %[[EMPTY_FLOAT]]
+//      EMPTY-CHECK:   util.return %[[EMPTY_INT]], %[[EMPTY_FLOAT]]
 
 // -----
 
-func.func @empty_within_dispatch_workgroup(%arg0: index, %arg1: index) -> tensor<?x?xf32> {
+util.func public @empty_within_dispatch_workgroup(%arg0: index, %arg1: index) -> tensor<?x?xf32> {
   %0 = flow.dispatch.workgroups[%arg0, %arg1](%arg0, %arg1, %arg0, %arg1) : (index, index, index, index) -> tensor<?x?xf32>{%arg0, %arg1} =
     (%arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>) {
       %1 = tensor.empty(%arg4, %arg5) : tensor<?x?xf32>
@@ -36,14 +36,14 @@
       %c1 = arith.constant 1 : index
       flow.return %arg2, %arg3, %c1 : index, index, index
     }
-  return %0 : tensor<?x?xf32>
+  util.return %0 : tensor<?x?xf32>
 }
-// ZERO-CHECK-LABEL: func.func @empty_within_dispatch_workgroup(
+// ZERO-CHECK-LABEL: util.func public @empty_within_dispatch_workgroup(
 //       ZERO-CHECK:   flow.dispatch.workgroup
 //       ZERO-CHECK:   tensor.empty
 //       ZERO-CHECK:   flow.return
 
-// EMPTY-CHECK-LABEL: func.func @empty_within_dispatch_workgroup(
+// EMPTY-CHECK-LABEL: util.func public @empty_within_dispatch_workgroup(
 //       EMPTY-CHECK:   flow.dispatch.workgroup
 //       EMPTY-CHECK:   tensor.empty
 //       EMPTY-CHECK:   flow.return

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/inject_dispatch_tracing.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/inject_dispatch_tracing.mlir
index c090fd6..870b1bc 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/inject_dispatch_tracing.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/inject_dispatch_tracing.mlir

@@ -1,22 +1,22 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(func.func(iree-flow-inject-dispatch-tracing))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(util.func(iree-flow-inject-dispatch-tracing))' %s | FileCheck %s
 
-// CHECK-LABEL: func.func @singleDispatch
+// CHECK-LABEL: util.func public @singleDispatch
 // CHECK-SAME: (%[[ARG0:.+]]: tensor<4xf32>)
-func.func @singleDispatch(%arg0: tensor<4xf32>) -> tensor<4xf32> {
+util.func public @singleDispatch(%arg0: tensor<4xf32>) -> tensor<4xf32> {
   %c4 = arith.constant 4 : index
   //      CHECK: flow.tensor.trace "ex::entry0 inputs" = [%[[ARG0]] : tensor<4xf32>]
   // CHECK-NEXT: %[[RET0:.+]] = flow.dispatch @ex::@entry0[%c4](%[[ARG0]]) : (tensor<4xf32>) -> tensor<4xf32>
   %0 = flow.dispatch @ex::@entry0[%c4](%arg0) : (tensor<4xf32>) -> tensor<4xf32>
   // CHECK-NEXT: flow.tensor.trace "ex::entry0 outputs" = [%[[RET0]] : tensor<4xf32>]
-  // CHECK-NEXT: return %[[RET0]]
-  return %0 : tensor<4xf32>
+  // CHECK-NEXT: util.return %[[RET0]]
+  util.return %0 : tensor<4xf32>
 }
 
 // -----
 
-// CHECK-LABEL: func.func @multiDispatch
+// CHECK-LABEL: util.func public @multiDispatch
 // CHECK-SAME: (%[[ARG0:.+]]: tensor<4xf32>)
-func.func @multiDispatch(%arg0: tensor<4xf32>) -> tensor<4xf32> {
+util.func public @multiDispatch(%arg0: tensor<4xf32>) -> tensor<4xf32> {
   %c4 = arith.constant 4 : index
 
   //      CHECK: flow.tensor.trace "ex::entry0 inputs" = [%[[ARG0]] : tensor<4xf32>]
@@ -29,6 +29,6 @@
   %1 = flow.dispatch @ex::@entry1[%c4](%0) : (tensor<4xf32>) -> tensor<4xf32>
   // CHECK-NEXT: flow.tensor.trace "ex::entry1 outputs" = [%[[RET1]] : tensor<4xf32>]
 
-  // CHECK: return %[[RET1]]
-  return %1 : tensor<4xf32>
+  // CHECK: util.return %[[RET1]]
+  util.return %1 : tensor<4xf32>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/insert_dispatch_debug_markers.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/insert_dispatch_debug_targets.mlir
similarity index 80%
rename from compiler/src/iree/compiler/Dialect/Flow/Transforms/test/insert_dispatch_debug_markers.mlir
rename to compiler/src/iree/compiler/Dialect/Flow/Transforms/test/insert_dispatch_debug_targets.mlir
index c15f268..3ba7dc5 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/insert_dispatch_debug_markers.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/insert_dispatch_debug_targets.mlir

@@ -4,9 +4,9 @@
 
 // Multiple functions.
 
-// CHECK-LABEL: func.func @target_func
-// ORDINAL_0-LABEL: func.func @target_func
-func.func @target_func(%arg0: tensor<4xf32>) -> !hal.buffer_view {
+// CHECK-LABEL: util.func public @target_func
+// ORDINAL_0-LABEL: util.func public @target_func
+util.func public @target_func(%arg0: tensor<4xf32>) -> !hal.buffer_view {
   %c4 = arith.constant 4 : index
   // CHECK: %[[D0:.+]] = flow.dispatch @dispatch_0::@dispatch_0_entry
   //      ORDINAL_0: flow.tensor.trace "dispatch_0::dispatch_0_entry::0 inputs"
@@ -18,12 +18,12 @@
   %2 = flow.dispatch @dispatch_2::@dispatch_2_entry[%c4] (%arg0) : (tensor<4xf32>) -> tensor<4xf32>
   %3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view
   // CHECK: %[[EXPORT:.+]] = hal.tensor.export %[[D1]] : tensor<4xf32> -> !hal.buffer_view
-  // CHECK: return %[[EXPORT]] : !hal.buffer_view
-  return %3 : !hal.buffer_view
+  // CHECK: util.return %[[EXPORT]] : !hal.buffer_view
+  util.return %3 : !hal.buffer_view
 }
 
-// CHECK-LABEL: func.func @other_func
-func.func @other_func(%arg0: tensor<4xf32>) -> !hal.buffer_view {
+// CHECK-LABEL: util.func public @other_func
+util.func public @other_func(%arg0: tensor<4xf32>) -> !hal.buffer_view {
   %c4 = arith.constant 4 : index
   // CHECK: %[[D3:.+]] = flow.dispatch @dispatch_3::@dispatch_3_entry
   %0 = flow.dispatch @dispatch_3::@dispatch_3_entry[%c4] (%arg0) : (tensor<4xf32>) -> tensor<4xf32>
@@ -38,17 +38,17 @@
   %3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view
 
   // Only break on the symbol as the ordinal specifies a different function.
-  // SYMBOL:  return %[[BREAK_EXPORT]] : !hal.buffer_view
-  // ORDINAL: return %[[ORIGINAL_EXPORT]] : !hal.buffer_view
-  return %3 : !hal.buffer_view
+  // SYMBOL:  util.return %[[BREAK_EXPORT]] : !hal.buffer_view
+  // ORDINAL: util.return %[[ORIGINAL_EXPORT]] : !hal.buffer_view
+  util.return %3 : !hal.buffer_view
 }
 
 // -----
 
 // Break on a dispatch with a different number of results.
 
-// CHECK-LABEL: func.func @target_func
-func.func @target_func(%arg0: tensor<4xf32>) -> !hal.buffer_view {
+// CHECK-LABEL: util.func public @target_func
+util.func public @target_func(%arg0: tensor<4xf32>) -> !hal.buffer_view {
   %c4 = arith.constant 4 : index
   // CHECK: %[[D0:.+]] = flow.dispatch @dispatch_0::@dispatch_0_entry
   %0 = flow.dispatch @dispatch_0::@dispatch_0_entry[%c4] (%arg0) : (tensor<4xf32>) -> tensor<4xf32>
@@ -58,32 +58,32 @@
   %3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view
   // CHECK: %[[EXPORT_0:.+]] = hal.tensor.export %[[D1]]#0 : tensor<4xf32> -> !hal.buffer_view
   // CHECK: %[[EXPORT_1:.+]] = hal.tensor.export %[[D1]]#1 : tensor<4xf32> -> !hal.buffer_view
-  // CHECK: return %[[EXPORT_0]], %[[EXPORT_1]] : !hal.buffer_view
-  return %3 : !hal.buffer_view
+  // CHECK: util.return %[[EXPORT_0]], %[[EXPORT_1]] : !hal.buffer_view
+  util.return %3 : !hal.buffer_view
 }
 
 // -----
 
 // Break/trace on a dispatch not found in the target function should do nothing.
 
-// CHECK-LABEL: func.func @target_func
-func.func @target_func(%arg0: tensor<4xf32>) -> !hal.buffer_view {
+// CHECK-LABEL: util.func public @target_func
+util.func public @target_func(%arg0: tensor<4xf32>) -> !hal.buffer_view {
   %c4 = arith.constant 4 : index
   // CHECK: %[[D0:.+]] = flow.dispatch @dispatch_0::@dispatch_0_entry
   %0 = flow.dispatch @dispatch_0::@dispatch_0_entry[%c4] (%arg0) : (tensor<4xf32>) -> tensor<4xf32>
   // CHECK: %[[D1:.+]] = hal.tensor.export %[[D0]] : tensor<4xf32> -> !hal.buffer_view
   %1 = hal.tensor.export %0 : tensor<4xf32> -> !hal.buffer_view
-  // CHECK: return %[[D1]] : !hal.buffer_view
-  return %1 : !hal.buffer_view
+  // CHECK: util.return %[[D1]] : !hal.buffer_view
+  util.return %1 : !hal.buffer_view
 }
 
 // -----
 
 // Combines tracing and breaking on the same dispatch.
 
-// CHECK-LABEL: func.func @target_func
+// CHECK-LABEL: util.func public @target_func
 // CHECK-SAME:       %[[ARG0:.+]]: tensor<4xf32>
-func.func @target_func(%arg0: tensor<4xf32>) -> !hal.buffer_view {
+util.func public @target_func(%arg0: tensor<4xf32>) -> !hal.buffer_view {
   %c4 = arith.constant 4 : index
   // CHECK: %[[D0:.+]] = flow.dispatch @dispatch_0::@dispatch_0_entry
   %0 = flow.dispatch @dispatch_0::@dispatch_0_entry[%c4] (%arg0) : (tensor<4xf32>) -> tensor<4xf32>
@@ -98,8 +98,8 @@
   %2 = flow.dispatch @dispatch_2::@dispatch_2_entry[%c4] (%arg0) : (tensor<4xf32>) -> tensor<4xf32>
   %3 = hal.tensor.export %2 : tensor<4xf32> -> !hal.buffer_view
   // CHECK: %[[EXPORT:.+]] = hal.tensor.export %[[D1]] : tensor<4xf32> -> !hal.buffer_view
-  // CHECK: return %[[EXPORT]] : !hal.buffer_view
-  return %3 : !hal.buffer_view
+  // CHECK: util.return %[[EXPORT]] : !hal.buffer_view
+  util.return %3 : !hal.buffer_view
 }
 
 
@@ -107,8 +107,8 @@
 
 // Checks regex matching on a dispatch symbol.
 
-// CHECK-LABEL: func.func @target_func
-func.func @target_func(%arg0: tensor<4xf32>) -> !hal.buffer_view {
+// CHECK-LABEL: util.func public @target_func
+util.func public @target_func(%arg0: tensor<4xf32>) -> !hal.buffer_view {
   %c4 = arith.constant 4 : index
 
   // SYMBOL: flow.tensor.trace "dispatch_1::dispatch_1_entry inputs"
@@ -122,5 +122,5 @@
   // SYMBOL-NOT: flow.tensor.trace "dispatch_11::dispatch_11_entry outputs"
 
   %2 = hal.tensor.export %1 : tensor<4xf32> -> !hal.buffer_view
-  return %2 : !hal.buffer_view
+  util.return %2 : !hal.buffer_view
 }

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/interchange_generic_ops.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/interchange_generic_ops.mlir
index 87d20ec..73f336f 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/interchange_generic_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/interchange_generic_ops.mlir

@@ -3,10 +3,10 @@
 // CHECK: #[[MAP0:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
 // CHECK: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d3, d0, d1)>
 // CHECK: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d2, d0, d1)>
-//      CHECK: func.func @interchange
+//      CHECK: util.func public @interchange
 //      CHECK:   linalg.generic {indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]
 // CHECK-SAME:   iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
-func.func @interchange(%arg0: tensor<?x?x?xf32>, %arg1: tensor<?x?x?xf32>, %arg2: tensor<?x?x?xf32>) -> (tensor<?x?x?xf32>) {
+util.func public @interchange(%arg0: tensor<?x?x?xf32>, %arg1: tensor<?x?x?xf32>, %arg2: tensor<?x?x?xf32>) -> (tensor<?x?x?xf32>) {
   %0 = linalg.generic {indexing_maps = [
     affine_map<(d0, d1, d2, d3) -> (d1, d0, d3)>,
     affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>,
@@ -19,5 +19,5 @@
     %a = arith.addf %arg5, %m : f32
     linalg.yield %a : f32
   } -> tensor<?x?x?xf32>
-  return %0 : tensor<?x?x?xf32>
+  util.return %0 : tensor<?x?x?xf32>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/interchange_transpose_generic_ops.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/interchange_transpose_generic_ops.mlir
index e88d704..e4a86b8 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/interchange_transpose_generic_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/interchange_transpose_generic_ops.mlir

@@ -1,6 +1,6 @@
 // RUN: iree-opt --split-input-file --verify-diagnostics --iree-flow-interchange-transpose-generic-ops --canonicalize -cse %s | FileCheck %s
 
-func.func @batch_matmul_transpose(%a: tensor<4x384x384xf32>, %b: tensor<4x384x32xf32>) -> tensor<384x4x32xf32> {
+util.func public @batch_matmul_transpose(%a: tensor<4x384x384xf32>, %b: tensor<4x384x32xf32>) -> tensor<384x4x32xf32> {
   %cst = arith.constant 0.000000e+00 : f32
   %init = tensor.empty() : tensor<4x384x32xf32>
   %c = linalg.fill ins(%cst : f32) outs(%init : tensor<4x384x32xf32>) -> tensor<4x384x32xf32>
@@ -10,14 +10,14 @@
   ^bb0(%arg0: f32, %arg1: f32):
     linalg.yield %arg0 : f32
   } -> tensor<384x4x32xf32>
-  return %transpose : tensor<384x4x32xf32>
+  util.return %transpose : tensor<384x4x32xf32>
 }
 
 // Check that linalg.generic's input and output indexing maps are exchanged.
 
 //      CHECK: #[[$MAP0:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 //      CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1, d2) -> (d1, d0, d2)>
-// CHECK-LABEL: func.func @batch_matmul_transpose
+// CHECK-LABEL: util.func public @batch_matmul_transpose
 //      CHECK:   %[[MATMUL:.+]] = linalg.batch_matmul
 //      CHECK:   linalg.generic
 // CHECK-SAME:     indexing_maps = [#[[$MAP0]], #[[$MAP1]]]
@@ -25,7 +25,7 @@
 
 // -----
 
-func.func @matmul_transpose(%a: tensor<128x384xf32>, %b: tensor<384x384xf32>) -> tensor<384x128xf32> {
+util.func public @matmul_transpose(%a: tensor<128x384xf32>, %b: tensor<384x384xf32>) -> tensor<384x128xf32> {
   %cst = arith.constant 0.000000e+00 : f32
   %cst1 = arith.constant 1.000000e+00 : f32
   %init = tensor.empty() : tensor<128x384xf32>
@@ -37,14 +37,14 @@
     %add = arith.addf %arg0, %cst1 : f32
     linalg.yield %add : f32
   } -> tensor<384x128xf32>
-  return %transpose : tensor<384x128xf32>
+  util.return %transpose : tensor<384x128xf32>
 }
 
 // Check that linalg.generic's input and output indexing maps are exchanged.
 
 //      CHECK: #[[$MAP0:.+]] = affine_map<(d0, d1) -> (d0, d1)>
 //      CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1) -> (d1, d0)>
-// CHECK-LABEL: func.func @matmul_transpose
+// CHECK-LABEL: util.func public @matmul_transpose
 //      CHECK:   %[[MATMUL:.+]] = linalg.matmul
 //      CHECK:   linalg.generic
 // CHECK-SAME:     indexing_maps = [#[[$MAP0]], #[[$MAP1]]]

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/outline_dispatch_externs.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/outline_dispatch_externs.mlir
index 7571eb3..70bb373 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/outline_dispatch_externs.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/outline_dispatch_externs.mlir

@@ -21,8 +21,8 @@
 // Demonstrates the full functionality of an extern dispatch op.
 // Note that some fields are optional.
 
-// CHECK-LABEL: func.func @dispatchExtern
-func.func @dispatchExtern(%arg0: tensor<4xi32>, %arg1: tensor<8xi32>, %arg2: i32) -> tensor<8xi32> {
+// CHECK-LABEL: util.func public @dispatchExtern
+util.func public @dispatchExtern(%arg0: tensor<4xi32>, %arg1: tensor<8xi32>, %arg2: i32) -> tensor<8xi32> {
   %x = arith.constant 100 : index
   %y = arith.constant 50 : index
   // Dispatch workgroups to the externally defined function "main" in the
@@ -60,6 +60,6 @@
         hal.return %ok : i1
       } ordinal(200) = [#hal.executable.object<{path = "b.o"}>]
     })
-  // CHECK: return %[[RESULT]]
-  return %result : tensor<8xi32>
+  // CHECK: util.return %[[RESULT]]
+  util.return %result : tensor<8xi32>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/outline_dispatch_regions.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/outline_dispatch_regions.mlir
index 67e90e1..0a0f9e5 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/outline_dispatch_regions.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/outline_dispatch_regions.mlir

@@ -11,9 +11,9 @@
 // CHECK-NEXT:   return
 // CHECK-NEXT: }
 
-// CHECK-LABEL: func.func @staticShapeDispatch(
+// CHECK-LABEL: util.func public @staticShapeDispatch(
 // CHECK-SAME: %[[ARG0:.+]]: tensor<8x4xf32>)
-func.func @staticShapeDispatch(%arg0 : tensor<8x4xf32>) -> tensor<4x8xf32> {
+util.func public @staticShapeDispatch(%arg0 : tensor<8x4xf32>) -> tensor<4x8xf32> {
   // CHECK-DAG: %[[X:.+]] = arith.constant 100
   %x = arith.constant 100 : index
   // CHECK-DAG: %[[Y:.+]] = arith.constant 50
@@ -29,8 +29,8 @@
     flow.dispatch.tensor.store %ret_value, %ret,  offsets=[0, 0], sizes=[4, 8], strides=[1, 1] : tensor<4x8xf32> -> !flow.dispatch.tensor<writeonly:tensor<4x8xf32>>
     flow.return
   }
-  // CHECK-NEXT: return %[[RET]]
-  return %0 : tensor<4x8xf32>
+  // CHECK-NEXT: util.return %[[RET]]
+  util.return %0 : tensor<4x8xf32>
 }
 
 // -----
@@ -43,9 +43,9 @@
 // CHECK-NEXT:   flow.executable.export public @dispatchFnMuli_dispatch_1
 //      CHECK: func.func @dispatchFnMuli_dispatch_1(
 
-// CHECK-LABEL: func.func @dispatchFnMuli(
+// CHECK-LABEL: util.func public @dispatchFnMuli(
 // CHECK-SAME: %[[ARG0:.+]]: tensor<8x4xf32>)
-func.func @dispatchFnMuli(%arg0 : tensor<8x4xf32>) -> tensor<8x4xf32> {
+util.func public @dispatchFnMuli(%arg0 : tensor<8x4xf32>) -> tensor<8x4xf32> {
   // CHECK-DAG: %[[X:.+]] = arith.constant 100
   %x = arith.constant 100 : index
   // CHECK-DAG: %[[Y:.+]] = arith.constant 50
@@ -72,16 +72,16 @@
     flow.dispatch.tensor.store %ret_value, %ret, offsets=[0, 0], sizes=[8, 4], strides=[1, 1] : tensor<8x4xf32> -> !flow.dispatch.tensor<writeonly:tensor<8x4xf32>>
     flow.return
   }
-  // CHECK-NEXT: return %[[RET1]]
-  return %1 : tensor<8x4xf32>
+  // CHECK-NEXT: util.return %[[RET1]]
+  util.return %1 : tensor<8x4xf32>
 }
 
 // -----
 
 // CHECK: flow.executable private @dispatchFn1_dispatch_0
 
-// CHECK-LABEL: func.func @dispatchFn1
-func.func @dispatchFn1(%arg0 : tensor<8x4xf32>) -> tensor<4x8xf32> {
+// CHECK-LABEL: util.func public @dispatchFn1
+util.func public @dispatchFn1(%arg0 : tensor<8x4xf32>) -> tensor<4x8xf32> {
   %x = arith.constant 100 : index
   %y = arith.constant 50 : index
   // CHECK: flow.dispatch @dispatchFn1_dispatch_0::@dispatchFn1_dispatch_0
@@ -93,13 +93,13 @@
   ) {
     flow.return
   }
-  return %0 : tensor<4x8xf32>
+  util.return %0 : tensor<4x8xf32>
 }
 
 // CHECK: flow.executable private @dispatchFn2_dispatch_0
 
-// CHECK-LABEL: func.func @dispatchFn2
-func.func @dispatchFn2(%arg0 : tensor<8x4xf32>) -> tensor<4x8xf32> {
+// CHECK-LABEL: util.func public @dispatchFn2
+util.func public @dispatchFn2(%arg0 : tensor<8x4xf32>) -> tensor<4x8xf32> {
   %x = arith.constant 100 : index
   %y = arith.constant 50 : index
   // CHECK: flow.dispatch @dispatchFn2_dispatch_0::@dispatchFn2_dispatch_0
@@ -111,7 +111,7 @@
   ) {
     flow.return
   }
-  return %0 : tensor<4x8xf32>
+  util.return %0 : tensor<4x8xf32>
 }
 
 // -----
@@ -130,9 +130,9 @@
 // CHECK:   return
 // CHECK-NEXT: }
 
-// CHECK-LABEL: func.func @dynamicShapeDispatch(
+// CHECK-LABEL: util.func public @dynamicShapeDispatch(
 // CHECK-SAME: %[[ARG0:.+]]: tensor<7x?x24x?xf32>
-func.func @dynamicShapeDispatch(%arg0 : tensor<7x?x24x?xf32>) -> tensor<?x?x1024xf32> {
+util.func public @dynamicShapeDispatch(%arg0 : tensor<7x?x24x?xf32>) -> tensor<?x?x1024xf32> {
   %c1 = arith.constant 1 : index
   %c3 = arith.constant 3 : index
   // CHECK-DAG: %[[DIM1:.+]] = tensor.dim %[[ARG0]], %c1
@@ -157,14 +157,14 @@
     flow.dispatch.tensor.store %ret_tile, %ret, offsets=[0, 0, 0], sizes=[%dim3_capture, %dim1_capture, 1024], strides=[1, 1, 1] : tensor<?x?x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x1024xf32>>{%dim3_capture, %dim1_capture}
     flow.return
   }
-  // CHECK-NEXT: return %[[RET0]]
-  return %ret0 : tensor<?x?x1024xf32>
+  // CHECK-NEXT: util.return %[[RET0]]
+  util.return %ret0 : tensor<?x?x1024xf32>
 }
 
 // -----
 
-// CHECK-LABEL: func.func @dispatchWithCountRegion
-func.func @dispatchWithCountRegion(%arg0: tensor<4xi32>) -> tensor<4xi32> {
+// CHECK-LABEL: util.func public @dispatchWithCountRegion
+util.func public @dispatchWithCountRegion(%arg0: tensor<4xi32>) -> tensor<4xi32> {
   %x = arith.constant 100 : index
   %y = arith.constant 50 : index
   %0 = flow.dispatch.workgroups[%x, %y](%arg0) : (tensor<4xi32>) -> %arg0 =
@@ -174,5 +174,5 @@
     %z = arith.constant 1 : index
     flow.return %x_capture, %y_capture, %z : index, index, index
   }
-  return %0 : tensor<4xi32>
+  util.return %0 : tensor<4xi32>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/pad_fusion_with_consumer.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/pad_fusion_with_consumer.mlir
index bfc5809..2945fca 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/pad_fusion_with_consumer.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/pad_fusion_with_consumer.mlir

@@ -1,6 +1,6 @@
-// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-flow-form-dispatch-regions{fuse-pad-with-consumers}))" --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline="builtin.module(util.func(iree-flow-form-dispatch-regions{fuse-pad-with-consumers}))" --split-input-file %s | FileCheck %s
 
-func.func @fuse_with_consumer(%arg0 : tensor<?x?x?x?xf32>, %arg1 : index,
+util.func public @fuse_with_consumer(%arg0 : tensor<?x?x?x?xf32>, %arg1 : index,
     %arg2 : index, %arg3 : index, %arg4 : index,
     %arg5 : tensor<?x?x?x?xf32>, %arg6 : tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
   %cst = arith.constant 42.0 : f32
@@ -10,9 +10,9 @@
   } : tensor<?x?x?x?xf32> to tensor<?x?x?x?xf32>
   %1 = linalg.conv_2d_nhwc_hwcf ins(%0, %arg5 : tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>)
       outs(%arg6 : tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
-  return %1 : tensor<?x?x?x?xf32>
+  util.return %1 : tensor<?x?x?x?xf32>
 }
-// CHECK-LABEL: func @fuse_with_consumer
+// CHECK-LABEL: util.func public @fuse_with_consumer
 //  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?x?x?xf32>
 //  CHECK-SAME:   %[[ARG5:[a-zA-Z0-9]+]]: tensor<?x?x?x?xf32>
 //  CHECK-SAME:   %[[ARG6:[a-zA-Z0-9]+]]: tensor<?x?x?x?xf32>
@@ -22,4 +22,4 @@
 //  CHECK-SAME:         ins(%[[PADDED]], %[[ARG5]] :
 //  CHECK-SAME:         outs(%[[ARG6]] :
 //       CHECK:     flow.return %[[CONV]]
-//       CHECK:   return %[[RETURN]]
+//       CHECK:   util.return %[[RETURN]]

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/pad_fusion_with_producer.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/pad_fusion_with_producer.mlir
index c54e6ce..541a142 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/pad_fusion_with_producer.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/pad_fusion_with_producer.mlir

@@ -1,6 +1,6 @@
-// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-flow-form-dispatch-regions{fuse-pad-with-producers}))" --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline="builtin.module(util.func(iree-flow-form-dispatch-regions{fuse-pad-with-producers}))" --split-input-file %s | FileCheck %s
 
-func.func @fuse_pad_with_producer(%arg0 : tensor<?x?x?x?xf32>,
+util.func public @fuse_pad_with_producer(%arg0 : tensor<?x?x?x?xf32>,
     %arg1 : tensor<?x?x?x?xf32>, %arg2 : tensor<?x?x?x?xf32>,
     %arg3 : tensor<?xf32>, %arg4 : index, %arg5 : index, %arg6 : index,
     %arg7 : index) -> tensor<?x?x?x?xf32> {
@@ -33,9 +33,9 @@
   ^bb0(%b0 : index, %b1 : index, %b2 : index, %b3 : index) :
     tensor.yield %cst : f32
   } : tensor<?x?x?x?xf32> to tensor<?x?x?x?xf32>
-  return %4 : tensor<?x?x?x?xf32>
+  util.return %4 : tensor<?x?x?x?xf32>
 }
-// CHECK-LABEL: func @fuse_pad_with_producer(
+// CHECK-LABEL: util.func public @fuse_pad_with_producer(
 //  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<?x?x?x?xf32>
 //  CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: tensor<?x?x?x?xf32>
 //  CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: tensor<?x?x?x?xf32>
@@ -48,4 +48,4 @@
 //  CHECK-SAME:         ins(%[[CONV]], %[[ARG3]]
 //       CHECK:     %[[PADDED:.+]] = tensor.pad %[[GENERIC]]
 //       CHECK:     flow.return %[[PADDED]]
-//       CHEKC:   return %[[RETURN]]
+//       CHEKC:   util.return %[[RETURN]]

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/pipeline_tests.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/pipeline_tests.mlir
index fcb9309..5485f59 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/pipeline_tests.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/pipeline_tests.mlir

@@ -6,7 +6,7 @@
 #map2 = affine_map<(d0, d1) -> (d0, d1)>
 #map3 = affine_map<(d0, d1) -> ()>
 module {
-  func.func @main(%arg0: tensor<833xi32>, %arg1: tensor<833x833xf32>, %arg2: tensor<f32>) -> tensor<f32> {
+  util.func public @main(%arg0: tensor<833xi32>, %arg1: tensor<833x833xf32>, %arg2: tensor<f32>) -> tensor<f32> {
     %cst = arith.constant 5.66893432E-4 : f32
     %0 = tensor.empty() : tensor<833x833xf32>
     %1 = linalg.generic {
@@ -35,7 +35,7 @@
         %10 = arith.addf %b1, %b0 : f32
         linalg.yield %10 : f32
       } -> tensor<f32>
-    return %9 : tensor<f32>
+    util.return %9 : tensor<f32>
   }
 }
 // Check that the linalg op with two reduction loops get folded into a single reduction
@@ -49,11 +49,11 @@
 //       CHECK:     func.func @[[FUNC1:[a-zA-Z0-9_x]+]]
 //       CHECK:       linalg.generic
 //  CHECK-SAME:         ["reduction"]
-//       CHECK:   func.func @main(
+//       CHECK:   util.func public @main(
 //       CHECK:     %[[T0:.+]] = flow.dispatch @[[EXECUTABLE0]]::@[[FUNC0]]
 //       CHECK:     %[[T1:.+]] = flow.tensor.reshape %[[T0]] : tensor<833x833xf32> -> tensor<693889xf32>
 //       CHECK:     %[[T2:.+]] = flow.dispatch @[[EXECUTABLE1]]::@[[FUNC1]](%[[T1]])
-//       CHECK:     return %[[T2]]
+//       CHECK:     util.return %[[T2]]
 
 // -----
 
@@ -63,7 +63,7 @@
 #map3 = affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d4)>
 #map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>
 module {
-  func.func @grouped_quantized_matmul(%arg0: tensor<4096x32x128xi4>, %arg1: tensor<1x1x32x128xf32>, %arg2: tensor<4096x32x1xf32>, %arg3: tensor<4096x32x1xf32>) -> tensor<1x1x4096xf32> {
+  util.func public @grouped_quantized_matmul(%arg0: tensor<4096x32x128xi4>, %arg1: tensor<1x1x32x128xf32>, %arg2: tensor<4096x32x1xf32>, %arg3: tensor<4096x32x1xf32>) -> tensor<1x1x4096xf32> {
     %cst = arith.constant 0.000000e+00 : f32
     %0 = tensor.empty() : tensor<1x1x4096xf32>
     %1 = tensor.empty() : tensor<4096x32x128xf32>
@@ -82,7 +82,7 @@
       %6 = arith.addf %5, %out : f32
       linalg.yield %6 : f32
     } -> tensor<1x1x4096xf32>
-    return %4 : tensor<1x1x4096xf32>
+    util.return %4 : tensor<1x1x4096xf32>
   }
 }
 // Check that the two linalg.generic ops are fused into the same dispatch
@@ -102,7 +102,7 @@
 //       CHECK:   arith.mulf
 //       CHECK:   arith.addf
 //       CHECK:   flow.dispatch.tensor.store %[[GEN1]]
-//       CHECK:   func.func @grouped_quantized_matmul(
+//       CHECK:   util.func public @grouped_quantized_matmul(
 //       CHECK:     %[[T0:.+]] = flow.dispatch @[[EXECUTABLE0]]::@[[FUNC0]]
 //       CHECK:     %[[RS:.+]] = flow.tensor.reshape %[[T0]] : tensor<4096xf32> -> tensor<1x1x4096xf32>
-//       CHECK:     return %[[RS]]
+//       CHECK:     util.return %[[RS]]

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/tensor_pad_to_tensor_insert_slice.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/tensor_pad_to_tensor_insert_slice.mlir
index 4ca0ca7..a23c475 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/tensor_pad_to_tensor_insert_slice.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/tensor_pad_to_tensor_insert_slice.mlir

@@ -1,22 +1,20 @@
 // RUN: iree-opt --split-input-file --iree-flow-tensor-pad-to-tensor-insert-slice --canonicalize %s | FileCheck %s
 // RUN: iree-opt --split-input-file --iree-flow-tensor-pad-to-tensor-insert-slice=skip-one-linalg-use-case --canonicalize %s | FileCheck %s --check-prefix=SKIP
 
-module  {
-  func.func @tensor_pad(%arg0 : tensor<?x?xf32>, %arg1 : tensor<f32>, %arg2 : index, %arg3 : index) -> tensor<?x?xf32> {
-    %c0 = arith.constant 0 : index
-    %c4 = arith.constant 4 : index
-    %c3 = arith.constant 3 : index
-    %0 = tensor.extract %arg1[] : tensor<f32>
-    %1 = tensor.pad %arg0 low[%c4, %arg2] high[%arg3, %c3]  {
-    ^bb0(%arg4: index, %arg5: index):
-      tensor.yield %0 : f32
-    } : tensor<?x?xf32> to tensor<?x?xf32>
-    return %1 : tensor<?x?xf32>
-  }
+util.func public @tensor_pad(%arg0 : tensor<?x?xf32>, %arg1 : tensor<f32>, %arg2 : index, %arg3 : index) -> tensor<?x?xf32> {
+  %c0 = arith.constant 0 : index
+  %c4 = arith.constant 4 : index
+  %c3 = arith.constant 3 : index
+  %0 = tensor.extract %arg1[] : tensor<f32>
+  %1 = tensor.pad %arg0 low[%c4, %arg2] high[%arg3, %c3]  {
+  ^bb0(%arg4: index, %arg5: index):
+    tensor.yield %0 : f32
+  } : tensor<?x?xf32> to tensor<?x?xf32>
+  util.return %1 : tensor<?x?xf32>
 }
 //   CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0, s1] -> (s0 + s1 + 4)>
 //   CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0, s1] -> (s0 + s1 + 3)>
-//       CHECK: func.func @tensor_pad
+//       CHECK: util.func public @tensor_pad
 //  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?xf32>
 //  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: tensor<f32>
 //  CHECK-SAME:   %[[ARG2:[a-zA-Z0-9_]+]]: index
@@ -35,25 +33,23 @@
 //   CHECK-DAG:   %[[D0:.+]] = tensor.dim %[[ARG0]], %[[C0]]
 //   CHECK-DAG:   %[[D1:.+]] = tensor.dim %[[ARG0]], %[[C1]]
 //       CHECK:   %[[RESULT:.+]] = tensor.insert_slice %[[ARG0]] into %[[FILL]][4, %[[ARG2]]] [%[[D0]], %[[D1]]] [1, 1]
-//       CHECK:   return %[[RESULT]]
+//       CHECK:   util.return %[[RESULT]]
 
 // -----
 
-module  {
-  func.func @tensor_pad_static(%arg0: tensor<12x4xf32>, %arg1: tensor<f32>) -> tensor<18x12xf32> {
-    %c4 = arith.constant 4 : index
-    %c2 = arith.constant 2 : index
-    %c5 = arith.constant 5 : index
-    %c3 = arith.constant 3 : index
-    %0 = tensor.extract %arg1[] : tensor<f32>
-    %1 = tensor.pad %arg0 low[%c4, %c5] high[%c2, %c3]  {
-    ^bb0(%arg2: index, %arg3: index):
-      tensor.yield %0 : f32
-    } : tensor<12x4xf32> to tensor<18x12xf32>
-    return %1 : tensor<18x12xf32>
-  }
+util.func public @tensor_pad_static(%arg0: tensor<12x4xf32>, %arg1: tensor<f32>) -> tensor<18x12xf32> {
+  %c4 = arith.constant 4 : index
+  %c2 = arith.constant 2 : index
+  %c5 = arith.constant 5 : index
+  %c3 = arith.constant 3 : index
+  %0 = tensor.extract %arg1[] : tensor<f32>
+  %1 = tensor.pad %arg0 low[%c4, %c5] high[%c2, %c3]  {
+  ^bb0(%arg2: index, %arg3: index):
+    tensor.yield %0 : f32
+  } : tensor<12x4xf32> to tensor<18x12xf32>
+  util.return %1 : tensor<18x12xf32>
 }
-// CHECK-LABEL: func.func @tensor_pad_static
+// CHECK-LABEL: util.func public @tensor_pad_static
 //  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: tensor<12x4xf32>
 //  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: tensor<f32>
 //   CHECK-DAG:   %[[VAL:.+]] = tensor.extract %[[ARG1]]
@@ -62,11 +58,11 @@
 //  CHECK-SAME:       ins(%[[VAL]] :
 //  CHECK-SAME:       outs(%[[INIT]] :
 //       CHECK:   %[[RESULT:.+]] = tensor.insert_slice %[[ARG0]] into %[[FILL]][4, 5] [12, 4] [1, 1]
-//       CHECK:   return %[[RESULT]]
+//       CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @_main(%arg0: tensor<1x33x33x480xf32>, %arg1: tensor<3x3x480x1xf32>) -> tensor<1x33x33x480xf32> {
+util.func public @_main(%arg0: tensor<1x33x33x480xf32>, %arg1: tensor<3x3x480x1xf32>) -> tensor<1x33x33x480xf32> {
   %cst = arith.constant 0.000000e+00 : f32
   %0 = tensor.pad %arg0 low[0, 4, 4, 0] high[0, 4, 4, 0] {
   ^bb0(%arg2: index, %arg3: index, %arg4: index, %arg5: index):
@@ -76,14 +72,14 @@
   %2 = tensor.collapse_shape %arg1 [[0], [1], [2, 3]] : tensor<3x3x480x1xf32> into tensor<3x3x480xf32>
   %3 = linalg.fill ins(%cst : f32) outs(%1 : tensor<1x33x33x480xf32>) -> tensor<1x33x33x480xf32>
   %4 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<4> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%0, %2 : tensor<1x41x41x480xf32>, tensor<3x3x480xf32>) outs(%3 : tensor<1x33x33x480xf32>) -> tensor<1x33x33x480xf32>
-  return %4 : tensor<1x33x33x480xf32>
+  util.return %4 : tensor<1x33x33x480xf32>
 }
 // CHECK-NOT: tensor.pad
 // SKIP: tensor.pad
 
 // ----
 
-func.func @dispatch_dispatch_0_generic_512x1024_f32(
+util.func public @dispatch_dispatch_0_generic_512x1024_f32(
     %arg0: !flow.dispatch.tensor<readonly:tensor<512x1024xf32>>,
     %arg1: index, %arg2: index, %arg3: index, %arg4: index,
     %arg5: !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], original_type = tensor<512x1024xf32>>>>) {
@@ -100,7 +96,7 @@
   } : tensor<512x1024xf32> to tensor<?x?xf32>
   %11 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], original_type = tensor<512x1024xf32>>>
   flow.dispatch.tensor.store %11, %2, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], original_type = tensor<512x1024xf32>>> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], original_type = tensor<512x1024xf32>>>>{%0, %1}
-  return
+  util.return
 }
 
 // CHECK:  %[[LOAD:.+]] = flow.dispatch.tensor.load

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/top_level_scf_to_cfg.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/top_level_scf_to_cfg.mlir
index 6b4987e..12503dd 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/top_level_scf_to_cfg.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/top_level_scf_to_cfg.mlir

@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-top-level-scf-to-cfg))" %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(util.func(iree-top-level-scf-to-cfg))" %s | FileCheck %s
 
 // CHECK-LABEL: @generic_nested_for
 // While not super recommended, we do have cases of SCF constructs embedded
@@ -6,7 +6,7 @@
 // The normal --convert-scf-to-std pass will produce an illegal linalg op
 // (multiple basic blocks). The --iree-top-level-scf-to-cfg should not touch it.
 #map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
-func.func @generic_nested_for(%arg0: tensor<?x?x?x?xi32>, %arg1: tensor<?x?x?x?xi32>, %out0: tensor<?x?x?x?xi32>) -> tensor<?x?x?x?xi32> {
+util.func public @generic_nested_for(%arg0: tensor<?x?x?x?xi32>, %arg1: tensor<?x?x?x?xi32>, %out0: tensor<?x?x?x?xi32>) -> tensor<?x?x?x?xi32> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c6 = arith.constant 6 : index
@@ -41,5 +41,5 @@
     linalg.yield %27 : i32
   } -> tensor<?x?x?x?xi32>
 
-  return %0 : tensor<?x?x?x?xi32>
+  util.return %0 : tensor<?x?x?x?xi32>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/transform_dispatch_region_formation.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/transform_dispatch_region_formation.mlir
index af63269..8610b28 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/transform_dispatch_region_formation.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/transform_dispatch_region_formation.mlir

@@ -1,16 +1,16 @@
 // RUN: iree-opt %s -iree-transform-dialect-interpreter -transform-dialect-drop-schedule -allow-unregistered-dialect -split-input-file | FileCheck %s
 
-// CHECK-LABEL: func @single_op(
+// CHECK-LABEL: util.func public @single_op(
 //  CHECK-SAME:   %[[arg0:.*]]: tensor<?x?xf32>, %[[s1:.*]]: index, %[[s2:.*]]: index
-func.func @single_op(%arg0: tensor<?x?xf32>, %s1: index, %s2: index) -> tensor<?x?xf32> {
+util.func public @single_op(%arg0: tensor<?x?xf32>, %s1: index, %s2: index) -> tensor<?x?xf32> {
   // CHECK: %[[region:.*]] = flow.dispatch.region -> (tensor<?x?xf32>{%[[s1]], %[[s2]]}) {
   // CHECK:   %[[slice:.*]] = tensor.extract_slice %[[arg0]]
   // CHECK:   flow.return %[[slice]]
   // CHECK: }
-  // CHECK: return %[[region]]
+  // CHECK: util.return %[[region]]
   %0 = tensor.extract_slice %arg0 [0, 10] [%s1, %s2] [1, 1]
       : tensor<?x?xf32> to tensor<?x?xf32>
-  return %0 : tensor<?x?xf32>
+  util.return %0 : tensor<?x?xf32>
 }
 
 module attributes { transform.with_named_sequence } {
@@ -23,9 +23,9 @@
 
 // -----
 
-// CHECK-LABEL: func @clone_preceding(
+// CHECK-LABEL: util.func public @clone_preceding(
 //  CHECK-SAME:   %[[arg0:.*]]: tensor<?x?xf32>, %[[arg1:.*]]: tensor<?x?xf32>, %[[s1:.*]]: index, %[[s2:.*]]: index
-func.func @clone_preceding(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %s1: index, %s2: index) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
+util.func public @clone_preceding(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %s1: index, %s2: index) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
   // CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
   // CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
   // CHECK-DAG: %[[dim0:.*]] = tensor.dim %[[arg1]], %[[c0]]
@@ -36,11 +36,11 @@
   // CHECK:   %[[insert:.*]] = tensor.insert_slice %[[dummy_clone]] into %[[arg1]]
   // CHECK:   flow.return %[[insert]]
   // CHECK: }
-  // CHECK: return %[[dummy]], %[[region]]
+  // CHECK: util.return %[[dummy]], %[[region]]
   %0 = "test.dummy"() : () -> (tensor<?x?xf32>)
   %1 = tensor.insert_slice %0 into %arg1 [5, 16] [%s1, %s2] [1, 1]
       : tensor<?x?xf32> into tensor<?x?xf32>
-  return %0, %1 : tensor<?x?xf32>, tensor<?x?xf32>
+  util.return %0, %1 : tensor<?x?xf32>, tensor<?x?xf32>
 }
 
 module attributes { transform.with_named_sequence } {
@@ -55,9 +55,9 @@
 
 // -----
 
-// CHECK-LABEL: func @move_preceding(
+// CHECK-LABEL: util.func public @move_preceding(
 //  CHECK-SAME:   %[[arg0:.*]]: tensor<?x?xf32>, %[[arg1:.*]]: tensor<?x?xf32>, %[[s1:.*]]: index, %[[s2:.*]]: index
-func.func @move_preceding(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %s1: index, %s2: index) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
+util.func public @move_preceding(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %s1: index, %s2: index) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
   // CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
   // CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
   // CHECK-DAG: %[[dim0:.*]] = tensor.dim %[[arg1]], %[[c0]]
@@ -67,12 +67,12 @@
   // CHECK:   %[[insert:.*]] = tensor.insert_slice %[[slice]] into %[[arg1]]
   // CHECK:   flow.return %[[insert]], %[[slice]]
   // CHECK: }
-  // CHECK: return %[[region]]#0, %[[region]]#1
+  // CHECK: util.return %[[region]]#0, %[[region]]#1
   %0 = tensor.extract_slice %arg0 [0, 10] [%s1, %s2] [1, 1]
       : tensor<?x?xf32> to tensor<?x?xf32>
   %1 = tensor.insert_slice %0 into %arg1 [5, 16] [%s1, %s2] [1, 1]
       : tensor<?x?xf32> into tensor<?x?xf32>
-  return %1, %0 : tensor<?x?xf32>, tensor<?x?xf32>
+  util.return %1, %0 : tensor<?x?xf32>, tensor<?x?xf32>
 }
 
 module attributes { transform.with_named_sequence } {
@@ -87,18 +87,18 @@
 
 // -----
 
-// CHECK-LABEL: func @create_region_and_convert_to_workgroups
+// CHECK-LABEL: util.func public @create_region_and_convert_to_workgroups
 //       CHECK:   tensor.empty()
 //       CHECK:   flow.dispatch.workgroups
 //       CHECK:     linalg.matmul
 //       CHECK:     flow.return
-func.func @create_region_and_convert_to_workgroups(
+util.func public @create_region_and_convert_to_workgroups(
     %A: tensor<5x3xf32>, %B: tensor<3x5xf32>) -> tensor<5x5xf32> {
   %init = tensor.empty() : tensor<5x5xf32>
   %matmul = linalg.matmul
       ins(%A, %B : tensor<5x3xf32>, tensor<3x5xf32>)
       outs(%init : tensor<5x5xf32>) -> tensor<5x5xf32>
-  return %matmul : tensor<5x5xf32>
+  util.return %matmul : tensor<5x5xf32>
 }
 
 module attributes { transform.with_named_sequence } {
@@ -112,7 +112,7 @@
 
 // -----
 
-// CHECK-LABEL: func @clone_multiple_preceding
+// CHECK-LABEL: util.func public @clone_multiple_preceding
 //   CHECK-DAG:   arith.constant
 //   CHECK-DAG:   arith.constant
 //   CHECK-DAG:   tensor.dim
@@ -123,7 +123,7 @@
 //  CHECK-NEXT:     "test.second_user"
 //  CHECK-NEXT:     "test.merge1"
 //  CHECK-NEXT:     "test.merge2"
-func.func @clone_multiple_preceding(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %s1: index, %s2: index) -> (tensor<?x?xf32>) {
+util.func public @clone_multiple_preceding(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %s1: index, %s2: index) -> (tensor<?x?xf32>) {
   %0 = "test.dummy_op"(%arg0) {__tagged__} : (tensor<?x?xf32>) -> (tensor<?x?xf32>)
   %1 = "test.first_user"(%0) {__tagged__} : (tensor<?x?xf32>) -> (tensor<?x?xf32>)
   %2 = "test.second_user"(%0) {__tagged__} : (tensor<?x?xf32>) -> (tensor<?x?xf32>)
@@ -132,7 +132,7 @@
   %4 = "test.merge2"(%1, %3) {__tagged__} : (tensor<?x?xf32>, tensor<?x?xf32>) -> (tensor<?x?xf32>)
   %5 = tensor.insert_slice %4 into %arg1 [5, 16] [%s1, %s2] [1, 1]
       : tensor<?x?xf32> into tensor<?x?xf32>
-  return %5 : tensor<?x?xf32>
+  util.return %5 : tensor<?x?xf32>
 }
 
 module attributes { transform.with_named_sequence } {
@@ -147,9 +147,9 @@
 
 // -----
 
-// CHECK-LABEL: func @move_succeeding(
+// CHECK-LABEL: util.func public @move_succeeding(
 //  CHECK-SAME:   %[[arg0:.*]]: tensor<?x?xf32>, %[[arg1:.*]]: tensor<?x?xf32>, %[[s1:.*]]: index, %[[s2:.*]]: index
-func.func @move_succeeding(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %s1: index, %s2: index) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
+util.func public @move_succeeding(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %s1: index, %s2: index) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
   // CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
   // CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
   // CHECK-DAG: %[[dim0:.*]] = tensor.dim %[[arg1]], %[[c0]]
@@ -159,12 +159,12 @@
   // CHECK:   %[[insert:.*]] = tensor.insert_slice %[[slice]] into %[[arg1]]
   // CHECK:   flow.return %[[slice]], %[[insert]]
   // CHECK: }
-  // CHECK: return %[[region]]#1, %[[region]]#0
+  // CHECK: util.return %[[region]]#1, %[[region]]#0
   %0 = tensor.extract_slice %arg0 [0, 10] [%s1, %s2] [1, 1]
       : tensor<?x?xf32> to tensor<?x?xf32>
   %1 = tensor.insert_slice %0 into %arg1 [5, 16] [%s1, %s2] [1, 1]
       : tensor<?x?xf32> into tensor<?x?xf32>
-  return %1, %0 : tensor<?x?xf32>, tensor<?x?xf32>
+  util.return %1, %0 : tensor<?x?xf32>, tensor<?x?xf32>
 }
 
 module attributes { transform.with_named_sequence } {
@@ -179,7 +179,7 @@
 
 // -----
 
-// CHECK-LABEL: func @move_multiple_succeeding
+// CHECK-LABEL: util.func public @move_multiple_succeeding
 //  CHECK-NEXT:   flow.dispatch.region -> (tensor<50x90xf32>, tensor<50x90xf32>, tensor<50x90xf32>, tensor<50x90xf32>, tensor<50x90xf32>, tensor<600x700xf32>)
 //  CHECK-NEXT:   "test.dummy_op"
 //  CHECK-NEXT:   "test.first_user"
@@ -190,7 +190,7 @@
 //  CHECK-NEXT:   flow.return
 //  CHECK-NEXT: }
 //  CHECK-NEXT: "test.third_user"
-func.func @move_multiple_succeeding(%arg0: tensor<50x90xf32>, %arg1: tensor<600x700xf32>) -> (tensor<600x700xf32>, tensor<50x90xf32>) {
+util.func public @move_multiple_succeeding(%arg0: tensor<50x90xf32>, %arg1: tensor<600x700xf32>) -> (tensor<600x700xf32>, tensor<50x90xf32>) {
   %0 = "test.dummy_op"(%arg0) : (tensor<50x90xf32>) -> (tensor<50x90xf32>)
   %1 = "test.first_user"(%0) {__tagged__} : (tensor<50x90xf32>) -> (tensor<50x90xf32>)
   %2 = "test.second_user"(%0) {__tagged__} : (tensor<50x90xf32>) -> (tensor<50x90xf32>)
@@ -199,7 +199,7 @@
   %4 = "test.merge2"(%1, %3) {__tagged__} : (tensor<50x90xf32>, tensor<50x90xf32>) -> (tensor<50x90xf32>)
   %5 = tensor.insert_slice %4 into %arg1 [5, 16] [50, 90] [1, 1] {__tagged__}
       : tensor<50x90xf32> into tensor<600x700xf32>
-  return %5, %u : tensor<600x700xf32>, tensor<50x90xf32>
+  util.return %5, %u : tensor<600x700xf32>, tensor<50x90xf32>
 }
 
 module attributes { transform.with_named_sequence } {
@@ -214,21 +214,21 @@
 
 // -----
 
-// CHECK-LABEL: func @clone_succeeding(
+// CHECK-LABEL: util.func public @clone_succeeding(
 //  CHECK-SAME:   %[[arg0:.*]]: tensor<?x?xf32>, %[[arg1:.*]]: tensor<?x?xf32>, %[[s1:.*]]: index, %[[s2:.*]]: index
-func.func @clone_succeeding(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %s1: index, %s2: index) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
+util.func public @clone_succeeding(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %s1: index, %s2: index) -> (tensor<?x?xf32>, tensor<?x?xf32>) {
   // CHECK: %[[region:.*]] = flow.dispatch.region -> (tensor<?x?xf32>{%[[s1]], %[[s2]]}) {
   // CHECK:   %[[slice:.*]] = tensor.extract_slice %[[arg0]]
   // CHECK:   tensor.insert_slice %[[slice]] into %[[arg1]]
   // CHECK:   flow.return %[[slice]]
   // CHECK: }
   // CHECK: %[[insert:.*]] = tensor.insert_slice %[[region]] into %[[arg1]]
-  // CHECK: return %[[insert]], %[[region]]
+  // CHECK: util.return %[[insert]], %[[region]]
   %0 = tensor.extract_slice %arg0 [0, 10] [%s1, %s2] [1, 1]
       : tensor<?x?xf32> to tensor<?x?xf32>
   %1 = tensor.insert_slice %0 into %arg1 [5, 16] [%s1, %s2] [1, 1]
       : tensor<?x?xf32> into tensor<?x?xf32>
-  return %1, %0 : tensor<?x?xf32>, tensor<?x?xf32>
+  util.return %1, %0 : tensor<?x?xf32>, tensor<?x?xf32>
 }
 
 module attributes { transform.with_named_sequence } {
@@ -245,8 +245,8 @@
 
 // This is a regression for reifyDynamicResultDims.
 
-// CHECK-LABEL: func @reify_result_dims_regression(
-func.func @reify_result_dims_regression(%s1: index, %s2: index) -> (tensor<4x?xf32>) {
+// CHECK-LABEL: util.func public @reify_result_dims_regression(
+util.func public @reify_result_dims_regression(%s1: index, %s2: index) -> (tensor<4x?xf32>) {
   // CHECK: %[[dest:.*]] = "test.dummy_dest"
   // CHECK: %[[c1:.*]] = arith.constant 1 : index
   // CHECK: %[[dim1:.*]] = tensor.dim %[[dest]], %[[c1]]
@@ -255,7 +255,7 @@
   // CHECK:   %[[insert:.*]] = tensor.insert_slice %[[src]] into %[[dest]]
   // CHECK:   flow.return %[[insert]]
   // CHECK: }
-  // CHECK: return %[[region]]
+  // CHECK: util.return %[[region]]
 
   // This op does not implement any interface for querying dynamic result dims.
   // Generate a tensor.dim op.
@@ -263,7 +263,7 @@
   %src = "test.dummy_src"() : () -> (tensor<?x?xf32>)
   %1 = tensor.insert_slice %src into %dest [5, 16] [%s1, %s2] [1, 1]
       : tensor<?x?xf32> into tensor<4x?xf32>
-  return %1 : tensor<4x?xf32>
+  util.return %1 : tensor<4x?xf32>
 }
 
 module attributes { transform.with_named_sequence } {

diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/verify_input_ir.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/verify_input_ir.mlir
index 81c8519..f746eab 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/verify_input_ir.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/verify_input_ir.mlir

@@ -1,21 +1,21 @@
-// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-verify-input-legality))" --verify-diagnostics %s -split-input-file
+// RUN: iree-opt --pass-pipeline="builtin.module(util.func(iree-verify-input-legality))" --verify-diagnostics %s -split-input-file
 
 // expected-error@below {{illegal operations still remain}}
-func.func @check_no_stablehlo(%arg0: tensor<?x?xf32>, %arg1 : tensor<?x?xf32>) -> tensor<?x?xf32> {
+util.func public @check_no_stablehlo(%arg0: tensor<?x?xf32>, %arg1 : tensor<?x?xf32>) -> tensor<?x?xf32> {
   // expected-error@+1 {{illegal op still exists}}
   %0 = stablehlo.add %arg0, %arg1 : tensor<?x?xf32>
   // expected-error@+1 {{illegal op still exists}}
   %1 = chlo.broadcast_add %0, %arg1 : (tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %1 : tensor<?x?xf32>
+  util.return %1 : tensor<?x?xf32>
 }
 
 // -----
 
 // expected-error@below {{illegal operations still remain}}
-func.func @check_no_tosa(%arg0: tensor<?x?xf32>, %arg1 : tensor<?x?xf32>) -> tensor<?x?xf32> {
+util.func public @check_no_tosa(%arg0: tensor<?x?xf32>, %arg1 : tensor<?x?xf32>) -> tensor<?x?xf32> {
   // expected-error@+1 {{illegal op still exists}}
   %0 = tosa.add %arg0, %arg1 : (tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %0 : tensor<?x?xf32>
+  util.return %0 : tensor<?x?xf32>
 }
 
 // -----
@@ -23,17 +23,17 @@
 // Note: checking that this is illegal even if the op could be folded. This pass
 // shouldn't be modifying the IR.
 // expected-error@below {{illegal operations still remain}}
-func.func @check_no_unrealized_cast(%arg0: tensor<?xf32>) -> tensor<?xf32> {
+util.func public @check_no_unrealized_cast(%arg0: tensor<?xf32>) -> tensor<?xf32> {
   // expected-error@+1 {{illegal op still exists}}
   %0 = builtin.unrealized_conversion_cast %arg0 : tensor<?xf32> to memref<?xf32>
   // expected-error@+1 {{illegal op still exists}}
   %1 = builtin.unrealized_conversion_cast %0 : memref<?xf32> to tensor<?xf32>
-  return %1 : tensor<?xf32>
+  util.return %1 : tensor<?xf32>
 }
 
 // -----
 
-func.func @check_linalg_ok(%conv : tensor<1x112x112x16xf32>, %bias : tensor<16xf32>, %init : tensor<1x112x112x16xf32>) -> tensor<1x112x112x16xf32> {
+util.func public @check_linalg_ok(%conv : tensor<1x112x112x16xf32>, %bias : tensor<16xf32>, %init : tensor<1x112x112x16xf32>) -> tensor<1x112x112x16xf32> {
   %result = linalg.generic {
       indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>,
                        affine_map<(d0, d1, d2, d3) -> (d3)>,
@@ -45,5 +45,5 @@
         %0 = arith.addf %arg0, %arg1 : f32
         linalg.yield %0 : f32
       } -> tensor<1x112x112x16xf32>
-  return %result : tensor<1x112x112x16xf32>
+  util.return %result : tensor<1x112x112x16xf32>
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToHAL/test/pseudo_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToHAL/test/pseudo_ops.mlir
index 4cb30f5..8533479 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToHAL/test/pseudo_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToHAL/test/pseudo_ops.mlir

@@ -22,13 +22,13 @@
 // CHECK-LABEL: @calculateWorkgroups
 // CHECK-SAME: (%[[DEVICE:.+]]: !hal.device,
 // CHECK-SAME:  %[[WORKLOAD_0:.+]]: index, %[[WORKLOAD_1:.+]]: index, %[[WORKLOAD_2:.+]]: index)
-func.func @calculateWorkgroups(%device: !hal.device, %workload_0: index, %workload_1: index, %workload_2: index) -> (index, index, index) {
+util.func public @calculateWorkgroups(%device: !hal.device, %workload_0: index, %workload_1: index, %workload_2: index) -> (index, index, index) {
   // CHECK-DAG: %[[WORKGROUP_YZ:.+]] = arith.constant 1 : index
   // CHECK-DAG: %[[WORKGROUP_X:.+]] = affine.apply
   %workgroups:3 = hal.executable.calculate_workgroups
       device(%device : !hal.device)
       target(@ex::@variant::@dispatch)
       workload([%workload_0, %workload_1, %workload_2]) : index, index, index
-  // CHECK: return %[[WORKGROUP_X]], %[[WORKGROUP_YZ]], %[[WORKGROUP_YZ]]
-  return %workgroups#0, %workgroups#1, %workgroups#2 : index, index, index
+  // CHECK: util.return %[[WORKGROUP_X]], %[[WORKGROUP_YZ]], %[[WORKGROUP_YZ]]
+  util.return %workgroups#0, %workgroups#1, %workgroups#2 : index, index, index
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/allocator_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/allocator_ops.mlir
index 3d47ef5..baa017e 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/allocator_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/allocator_ops.mlir

@@ -1,20 +1,20 @@
 // RUN: iree-opt --split-input-file --canonicalize --iree-convert-hal-to-vm %s | FileCheck %s
 
 // CHECK-LABEL: vm.func private @allocatorAllocate
-func.func @allocatorAllocate(%arg0 : !hal.allocator) -> !hal.buffer {
+util.func public @allocatorAllocate(%arg0 : !hal.allocator) -> !hal.buffer {
   // CHECK-DAG: %[[SIZE:.+]] = vm.const.i64 1024
   %size = arith.constant 1024 : index
   // CHECK-DAG: %[[AFFINITY:.+]] = vm.const.i64 -1
   %affinity = arith.constant -1 : i64
   // CHECK: %ref = vm.call @hal.allocator.allocate(%arg0, %[[AFFINITY]], %c70, %c3075, %[[SIZE]]) : (!vm.ref<!hal.allocator>, i64, i32, i32, i64) -> !vm.ref<!hal.buffer>
   %0 = hal.allocator.allocate<%arg0 : !hal.allocator> affinity(%affinity) type("HostLocal") usage("DispatchStorage|Transfer") : !hal.buffer{%size}
-  return %0 : !hal.buffer
+  util.return %0 : !hal.buffer
 }
 
 // -----
 
 // CHECK-LABEL: vm.func private @allocatorImport
-func.func @allocatorImport(%arg0 : !hal.allocator, %arg1 : !util.buffer) -> (i1, !hal.buffer) {
+util.func public @allocatorImport(%arg0 : !hal.allocator, %arg1 : !util.buffer) -> (i1, !hal.buffer) {
   // CHECK-DAG: %[[OFFSET:.+]] = vm.const.i64 128
   %offset = arith.constant 128 : index
   // CHECK-DAG: %[[LENGTH:.+]] = vm.const.i64 256
@@ -24,6 +24,6 @@
   // CHECK: %[[IMPORTED:.+]] = vm.call @hal.allocator.import(%arg0, %c1, %[[AFFINITY]], %c6, %c3, %arg1, %[[OFFSET]], %[[LENGTH]]) : (!vm.ref<!hal.allocator>, i32, i64, i32, i32, !vm.buffer, i64, i64) -> !vm.ref<!hal.buffer>
   %did_import, %buffer = hal.allocator.import<%arg0 : !hal.allocator> source(%arg1 : !util.buffer)[%offset, %length] affinity(%affinity) type("HostVisible|HostCoherent") usage("Transfer") : i1, !hal.buffer
   // CHECK: %[[DID_IMPORT:.+]] = vm.cmp.nz.ref %[[IMPORTED]]
-  // CHECK: return %[[DID_IMPORT]], %[[IMPORTED]]
-  return %did_import, %buffer : i1, !hal.buffer
+  // CHECK: vm.return %[[DID_IMPORT]], %[[IMPORTED]]
+  util.return %did_import, %buffer : i1, !hal.buffer
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/buffer_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/buffer_ops.mlir
index 55c7f4c..21c76ab 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/buffer_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/buffer_ops.mlir

@@ -2,56 +2,56 @@
 
 // CHECK-LABEL: @buffer_subspan
 // CHECK-SAME: (%[[BUFFER:.+]]: !vm.ref<!hal.buffer>)
-func.func @buffer_subspan(%buffer : !hal.buffer) -> !hal.buffer {
+util.func public @buffer_subspan(%buffer : !hal.buffer) -> !hal.buffer {
   %c42 = arith.constant 42 : index
   %c43 = arith.constant 43 : index
   // CHECK: %[[RET:.+]] = vm.call @hal.buffer.subspan(%[[BUFFER]], %c42, %c43) {nosideeffects} : (!vm.ref<!hal.buffer>, i64, i64) -> !vm.ref<!hal.buffer>
   %subspan = hal.buffer.subspan<%buffer : !hal.buffer>[%c42, %c43] : !hal.buffer
-  // CHECK: return %[[RET]]
-  return %subspan: !hal.buffer
+  // CHECK: vm.return %[[RET]]
+  util.return %subspan: !hal.buffer
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_load_i8
 // CHECK-SAME: (%[[BUFFER:.+]]: !vm.ref<!hal.buffer>)
-func.func @buffer_load_i8(%buffer: !hal.buffer) -> i8 {
+util.func public @buffer_load_i8(%buffer: !hal.buffer) -> i8 {
   %c64 = arith.constant 64 : index
   // CHECK: %[[RET:.+]] = vm.call @hal.buffer.load(%[[BUFFER]], %c64, %c1) : (!vm.ref<!hal.buffer>, i64, i32) -> i32
   %0 = hal.buffer.load<%buffer: !hal.buffer>[%c64] : i8
-  // CHECK: return %[[RET]]
-  return %0 : i8
+  // CHECK: vm.return %[[RET]]
+  util.return %0 : i8
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_load_i16
 // CHECK-SAME: (%[[BUFFER:.+]]: !vm.ref<!hal.buffer>)
-func.func @buffer_load_i16(%buffer: !hal.buffer) -> i16 {
+util.func public @buffer_load_i16(%buffer: !hal.buffer) -> i16 {
   %c64 = arith.constant 64 : index
   // CHECK: %[[RET:.+]] = vm.call @hal.buffer.load(%[[BUFFER]], %c64, %c2) : (!vm.ref<!hal.buffer>, i64, i32) -> i32
   %0 = hal.buffer.load<%buffer: !hal.buffer>[%c64] : i16
-  // CHECK: return %[[RET]]
-  return %0 : i16
+  // CHECK: vm.return %[[RET]]
+  util.return %0 : i16
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_load_i32
 // CHECK-SAME: (%[[BUFFER:.+]]: !vm.ref<!hal.buffer>)
-func.func @buffer_load_i32(%buffer: !hal.buffer) -> i32 {
+util.func public @buffer_load_i32(%buffer: !hal.buffer) -> i32 {
   %c64 = arith.constant 64 : index
   // CHECK: %[[RET:.+]] = vm.call @hal.buffer.load(%[[BUFFER]], %c64, %c4) : (!vm.ref<!hal.buffer>, i64, i32) -> i32
   %0 = hal.buffer.load<%buffer: !hal.buffer>[%c64] : i32
-  // CHECK: return %[[RET]]
-  return %0 : i32
+  // CHECK: vm.return %[[RET]]
+  util.return %0 : i32
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_load_i64
 // CHECK-SAME: (%[[BUFFER:.+]]: !vm.ref<!hal.buffer>)
-func.func @buffer_load_i64(%buffer: !hal.buffer) -> i64 {
+util.func public @buffer_load_i64(%buffer: !hal.buffer) -> i64 {
   %c64 = arith.constant 64 : index
 
   // CHECK-DAG: %[[OFFSET_HI:.+]] = vm.add.i64 %c64, %c4
@@ -65,61 +65,61 @@
   // CHECK: %[[RET:.+]] = vm.or.i64 %[[LO_I64]], %[[HI_I64]]
 
   %0 = hal.buffer.load<%buffer: !hal.buffer>[%c64] : i64
-  // CHECK: return %[[RET]]
-  return %0 : i64
+  // CHECK: vm.return %[[RET]]
+  util.return %0 : i64
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_load_f32
 // CHECK-SAME: (%[[BUFFER:.+]]: !vm.ref<!hal.buffer>)
-func.func @buffer_load_f32(%buffer: !hal.buffer) -> f32 {
+util.func public @buffer_load_f32(%buffer: !hal.buffer) -> f32 {
   %c64 = arith.constant 64 : index
   // CHECK: %[[RET_I32:.+]] = vm.call @hal.buffer.load(%[[BUFFER]], %c64, %c4) : (!vm.ref<!hal.buffer>, i64, i32) -> i32
   %0 = hal.buffer.load<%buffer: !hal.buffer>[%c64] : f32
   // CHECK: %[[RET:.+]] = vm.bitcast.i32.f32 %[[RET_I32]]
-  // CHECK: return %[[RET]]
-  return %0 : f32
+  // CHECK: vm.return %[[RET]]
+  util.return %0 : f32
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_store_i8
 // CHECK-SAME: (%[[BUFFER:.+]]: !vm.ref<!hal.buffer>, %[[VALUE:.+]]: i32)
-func.func @buffer_store_i8(%buffer: !hal.buffer, %value: i8) {
+util.func public @buffer_store_i8(%buffer: !hal.buffer, %value: i8) {
   %c64 = arith.constant 64 : index
   // CHECK: vm.call @hal.buffer.store(%[[VALUE]], %[[BUFFER]], %c64, %c1) : (i32, !vm.ref<!hal.buffer>, i64, i32) -> ()
   hal.buffer.store<%buffer : !hal.buffer>[%c64] value(%value : i8)
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_store_i16
 // CHECK-SAME: (%[[BUFFER:.+]]: !vm.ref<!hal.buffer>, %[[VALUE:.+]]: i32)
-func.func @buffer_store_i16(%buffer: !hal.buffer, %value: i16) {
+util.func public @buffer_store_i16(%buffer: !hal.buffer, %value: i16) {
   %c64 = arith.constant 64 : index
   // CHECK: vm.call @hal.buffer.store(%[[VALUE]], %[[BUFFER]], %c64, %c2) : (i32, !vm.ref<!hal.buffer>, i64, i32) -> ()
   hal.buffer.store<%buffer : !hal.buffer>[%c64] value(%value : i16)
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_store_i32
 // CHECK-SAME: (%[[BUFFER:.+]]: !vm.ref<!hal.buffer>, %[[VALUE:.+]]: i32)
-func.func @buffer_store_i32(%buffer: !hal.buffer, %value: i32) {
+util.func public @buffer_store_i32(%buffer: !hal.buffer, %value: i32) {
   %c64 = arith.constant 64 : index
   // CHECK: vm.call @hal.buffer.store(%[[VALUE]], %[[BUFFER]], %c64, %c4) : (i32, !vm.ref<!hal.buffer>, i64, i32) -> ()
   hal.buffer.store<%buffer : !hal.buffer>[%c64] value(%value : i32)
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_store_i64
 // CHECK-SAME: (%[[BUFFER:.+]]: !vm.ref<!hal.buffer>, %[[VALUE:.+]]: i64)
-func.func @buffer_store_i64(%buffer: !hal.buffer, %value: i64) {
+util.func public @buffer_store_i64(%buffer: !hal.buffer, %value: i64) {
   %c64 = arith.constant 64 : index
 
   // CHECK-DAG: %[[VALUE_LO:.+]] = vm.trunc.i64.i32 %[[VALUE]]
@@ -131,17 +131,17 @@
   // CHECK: vm.call @hal.buffer.store(%[[VALUE_HI]], %[[BUFFER]], %[[OFFSET_HI]], %c4) : (i32, !vm.ref<!hal.buffer>, i64, i32) -> ()
 
   hal.buffer.store<%buffer : !hal.buffer>[%c64] value(%value : i64)
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_store_f32
 // CHECK-SAME: (%[[BUFFER:.+]]: !vm.ref<!hal.buffer>, %[[VALUE:.+]]: f32)
-func.func @buffer_store_f32(%buffer: !hal.buffer, %value: f32) {
+util.func public @buffer_store_f32(%buffer: !hal.buffer, %value: f32) {
   %c64 = arith.constant 64 : index
   // CHECK: %[[VALUE_I32:.+]] = vm.bitcast.f32.i32 %[[VALUE]]
   // CHECK: vm.call @hal.buffer.store(%[[VALUE_I32]], %[[BUFFER]], %c64, %c4) : (i32, !vm.ref<!hal.buffer>, i64, i32) -> ()
   hal.buffer.store<%buffer : !hal.buffer>[%c64] value(%value : f32)
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/buffer_view_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/buffer_view_ops.mlir
index ff713e6..3d913a1 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/buffer_view_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/buffer_view_ops.mlir

@@ -1,28 +1,28 @@
 // RUN: iree-opt --split-input-file --iree-convert-hal-to-vm --iree-vm-target-index-bits=32 %s | FileCheck %s
 
 // CHECK-LABEL: @element_type
-func.func @element_type() -> i32 {
+util.func public @element_type() -> i32 {
   // CHECK: %[[RET:.+]] = vm.const.i32 553648160
   %element_type = hal.element_type<f32> : i32
-  // CHECK: return %[[RET]]
-  return %element_type : i32
+  // CHECK: vm.return %[[RET]]
+  util.return %element_type : i32
 }
 
 // -----
 
 // CHECK-LABEL: @encoding_type
-func.func @encoding_type() -> i32 {
+util.func public @encoding_type() -> i32 {
   // CHECK: %[[RET:.+]] = vm.const.i32 1
   %encoding_type = hal.encoding_type<dense_row_major> : i32
-  // CHECK: return %[[RET]]
-  return %encoding_type : i32
+  // CHECK: vm.return %[[RET]]
+  util.return %encoding_type : i32
 }
 
 // -----
 
 // CHECK-LABEL: vm.func private @buffer_view_dims
 // CHECK-SAME: %[[VIEW:.+]]: !vm.ref<!hal.buffer_view>
-func.func @buffer_view_dims(%arg0 : !hal.buffer_view) -> (index, index, index) {
+util.func public @buffer_view_dims(%arg0 : !hal.buffer_view) -> (index, index, index) {
   // CHECK-DAG: %[[D0_64:.+]] = vm.call @hal.buffer_view.dim(%[[VIEW]], %zero)
   // CHECK-DAG: %[[D1_64:.+]] = vm.call @hal.buffer_view.dim(%[[VIEW]], %c1)
   // CHECK-DAG: %[[D2_64:.+]] = vm.call @hal.buffer_view.dim(%[[VIEW]], %c2)
@@ -33,5 +33,5 @@
   // CHECK-DAG: %[[D1_32:.+]] = vm.trunc.i64.i32 %[[D1_64]]
   // CHECK-DAG: %[[D2_32:.+]] = vm.trunc.i64.i32 %[[D2_64]]
   // CHECK-NEXT: vm.return %[[D0_32]], %[[D1_32]], %[[D2_32]]
-  return %0, %1, %2 : index, index, index
+  util.return %0, %1, %2 : index, index, index
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/channel_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/channel_ops.mlir
index 749fafa..ebc0f71 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/channel_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/channel_ops.mlir

@@ -2,7 +2,7 @@
 
 // CHECK-LABEL: @channel_create
 //  CHECK-SAME: (%[[DEVICE:.+]]: !vm.ref<!hal.device>, %[[AFFINITY:.+]]: i64, %[[ID:.+]]: !vm.buffer, %[[GROUP:.+]]: !vm.buffer, %[[RANK:.+]]: i32, %[[COUNT:.+]]: i32) -> !vm.ref<!hal.channel>
-func.func @channel_create(%device: !hal.device, %affinity: i64, %id: !util.buffer, %group: !util.buffer, %rank: i32, %count: i32) -> !hal.channel {
+util.func public @channel_create(%device: !hal.device, %affinity: i64, %id: !util.buffer, %group: !util.buffer, %rank: i32, %count: i32) -> !hal.channel {
   // CHECK: %[[FLAGS:.+]] = vm.const.i32.zero
   // CHECK: %[[CHANNEL:.+]] = vm.call @hal.channel.create(%[[DEVICE]], %[[AFFINITY]], %[[FLAGS]], %[[ID]], %[[GROUP]], %[[RANK]], %[[COUNT]])
   %channel = hal.channel.create device(%device : !hal.device)
@@ -12,32 +12,32 @@
                                  group(%group)
                                   rank(%rank)
                                  count(%count) : !hal.channel
-  // CHECK: return %[[CHANNEL]]
-  return %channel : !hal.channel
+  // CHECK: vm.return %[[CHANNEL]]
+  util.return %channel : !hal.channel
 }
 
 // -----
 
 // CHECK-LABEL: @channel_split
 //  CHECK-SAME: (%[[BASE_CHANNEL:.+]]: !vm.ref<!hal.channel>, %[[COLOR:.+]]: i32, %[[KEY:.+]]: i32)
-func.func @channel_split(%base_channel: !hal.channel, %color: i32, %key: i32) -> !hal.channel {
+util.func public @channel_split(%base_channel: !hal.channel, %color: i32, %key: i32) -> !hal.channel {
   // CHECK: %[[FLAGS:.+]] = vm.const.i32.zero
   // CHECK: %[[SPLIT_CHANNEL:.+]] = vm.call @hal.channel.split(%[[BASE_CHANNEL]], %[[COLOR]], %[[KEY]], %[[FLAGS]])
   %split_channel = hal.channel.split<%base_channel : !hal.channel>
                                color(%color)
                                  key(%key)
                                flags(0) : !hal.channel
-  // CHECK: return %[[SPLIT_CHANNEL]]
-  return %split_channel : !hal.channel
+  // CHECK: vm.return %[[SPLIT_CHANNEL]]
+  util.return %split_channel : !hal.channel
 }
 
 // -----
 
 // CHECK-LABEL: @channel_rank_and_count
 //  CHECK-SAME: %[[CHANNEL:.+]]: !vm.ref<!hal.channel>
-func.func @channel_rank_and_count(%channel: !hal.channel) -> (i32, i32) {
+util.func public @channel_rank_and_count(%channel: !hal.channel) -> (i32, i32) {
   // CHECK: %[[RANK_COUNT:.+]]:2 = vm.call @hal.channel.rank_and_count(%[[CHANNEL]])
   %rank, %count = hal.channel.rank_and_count<%channel : !hal.channel> : i32, i32
-  // CHECK: return %[[RANK_COUNT]]#0, %[[RANK_COUNT]]#1
-  return %rank, %count : i32, i32
+  // CHECK: vm.return %[[RANK_COUNT]]#0, %[[RANK_COUNT]]#1
+  util.return %rank, %count : i32, i32
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/command_buffer_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/command_buffer_ops.mlir
index db6304d..432a76e 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/command_buffer_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/command_buffer_ops.mlir

@@ -1,34 +1,34 @@
 // RUN: iree-opt --split-input-file --iree-convert-hal-to-vm --canonicalize --iree-vm-target-index-bits=32 %s | FileCheck %s
 
 // CHECK-LABEL: @command_buffer_create
-func.func @command_buffer_create(%arg0: !hal.device) {
+util.func public @command_buffer_create(%arg0: !hal.device) {
   // CHECK: %ref = vm.call @hal.command_buffer.create(%arg0, %c1, %c3, %zero) : (!vm.ref<!hal.device>, i32, i32, i32) -> !vm.ref<!hal.command_buffer>
   %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("OneShot") categories("Transfer|Dispatch") : !hal.command_buffer
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @command_buffer_create_bindings
-func.func @command_buffer_create_bindings(%arg0: !hal.device, %arg1: index) {
+util.func public @command_buffer_create_bindings(%arg0: !hal.device, %arg1: index) {
   // CHECK: %ref = vm.call @hal.command_buffer.create(%arg0, %c1, %c3, %arg1) : (!vm.ref<!hal.device>, i32, i32, i32) -> !vm.ref<!hal.command_buffer>
   %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("OneShot") categories("Transfer|Dispatch") bindings(%arg1) : !hal.command_buffer
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @command_buffer_finalize
-func.func @command_buffer_finalize(%arg0: !hal.command_buffer) {
+util.func public @command_buffer_finalize(%arg0: !hal.command_buffer) {
   // CHECK: vm.call @hal.command_buffer.finalize(%arg0) : (!vm.ref<!hal.command_buffer>) -> ()
   hal.command_buffer.finalize<%arg0 : !hal.command_buffer>
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @command_buffer_execution_barrier
-func.func @command_buffer_execution_barrier(
+util.func public @command_buffer_execution_barrier(
   %arg0: !hal.command_buffer,
   %arg1: !hal.buffer
 ) {
@@ -37,13 +37,13 @@
       source("CommandIssue")
       target("CommandProcess")
       flags("None")
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @command_buffer_fill_buffer_i8
-func.func @command_buffer_fill_buffer_i8(
+util.func public @command_buffer_fill_buffer_i8(
   %arg0: !hal.command_buffer,
   %arg1: !hal.buffer,
   %arg2: i8
@@ -56,13 +56,13 @@
   hal.command_buffer.fill_buffer<%arg0 : !hal.command_buffer>
       target(%arg1 : !hal.buffer)[%c100, %c200]
       pattern(%arg2 : i8)
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @command_buffer_fill_buffer_i16
-func.func @command_buffer_fill_buffer_i16(
+util.func public @command_buffer_fill_buffer_i16(
   %arg0: !hal.command_buffer,
   %arg1: !hal.buffer,
   %arg2: i16
@@ -75,13 +75,13 @@
   hal.command_buffer.fill_buffer<%arg0 : !hal.command_buffer>
       target(%arg1 : !hal.buffer)[%c100, %c200]
       pattern(%arg2 : i16)
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @command_buffer_fill_buffer_i32
-func.func @command_buffer_fill_buffer_i32(
+util.func public @command_buffer_fill_buffer_i32(
   %arg0: !hal.command_buffer,
   %arg1: !hal.buffer,
   %arg2: i32
@@ -93,13 +93,13 @@
   hal.command_buffer.fill_buffer<%arg0 : !hal.command_buffer>
       target(%arg1 : !hal.buffer)[%c100, %c200]
       pattern(%arg2 : i32)
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @command_buffer_copy_buffer
-func.func @command_buffer_copy_buffer(
+util.func public @command_buffer_copy_buffer(
   %arg0: !hal.command_buffer,
   %arg1: !hal.buffer
 ) {
@@ -111,7 +111,7 @@
       source(%arg1 : !hal.buffer)[%c100]
       target(%arg1 : !hal.buffer)[%c200]
       length(%c300)
-  return
+  util.return
 }
 
 // -----
@@ -122,7 +122,7 @@
 //  CHECK-SAME:  %[[PARAM:.+]]: i32,
 //  CHECK-SAME:  %[[SEND_BUFFER:.+]]: !vm.ref<!hal.buffer>, %[[RECV_BUFFER:.+]]: !vm.ref<!hal.buffer>,
 //  CHECK-SAME:  %[[COUNT:.+]]: i32)
-func.func @command_buffer_collective_all_reduce_sum(
+util.func public @command_buffer_collective_all_reduce_sum(
     %cmd: !hal.command_buffer,
     %channel: !hal.channel,
     %param: i32,
@@ -146,7 +146,7 @@
       send(%send_buffer : !hal.buffer)[%c10, %c128]
       recv(%recv_buffer : !hal.buffer)[%c20, %c256]
       count(%count)
-  return
+  util.return
 }
 
 // -----
@@ -157,7 +157,7 @@
 //  CHECK-SAME:  %[[PARAM:.+]]: i32,
 //  CHECK-SAME:  %[[SEND_BUFFER:.+]]: !vm.ref<!hal.buffer>,
 //  CHECK-SAME:  %[[COUNT:.+]]: i32)
-func.func @command_buffer_collective_send(
+util.func public @command_buffer_collective_send(
     %cmd: !hal.command_buffer,
     %channel: !hal.channel,
     %param: i32,
@@ -179,7 +179,7 @@
       param(%param : i32)
       send(%send_buffer : !hal.buffer)[%c10, %c128]
       count(%count)
-  return
+  util.return
 }
 
 // -----
@@ -189,7 +189,7 @@
 //  CHECK-SAME: %[[LAYOUT:.+]]: !vm.ref<!hal.pipeline_layout>,
 //  CHECK-SAME: %[[BUFFER:.+]]: !vm.ref<!hal.buffer>,
 //  CHECK-SAME: %[[SLOT:.+]]: i32
-func.func @command_buffer_push_descriptor_set(
+util.func public @command_buffer_push_descriptor_set(
     %cmd: !hal.command_buffer,
     %layout: !hal.pipeline_layout,
     %buffer: !hal.buffer,
@@ -214,13 +214,13 @@
         %c0 = (%buffer : !hal.buffer)[%c4096, %c8000],
         %c1 = (%slot : index)[%c4, %c4096]
       ])
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @command_buffer_dispatch
-func.func @command_buffer_dispatch(
+util.func public @command_buffer_dispatch(
   %arg0: !hal.command_buffer,
   %arg1: !hal.executable
 ) {
@@ -231,13 +231,13 @@
   hal.command_buffer.dispatch<%arg0 : !hal.command_buffer>
       target(%arg1 : !hal.executable)[0]
       workgroups([%c100, %c200, %c300])
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @command_buffer_dispatch_indirect
-func.func @command_buffer_dispatch_indirect(
+util.func public @command_buffer_dispatch_indirect(
   %arg0: !hal.command_buffer,
   %arg1: !hal.executable,
   %arg2: !hal.buffer
@@ -247,5 +247,5 @@
   hal.command_buffer.dispatch.indirect<%arg0 : !hal.command_buffer>
       target(%arg1 : !hal.executable)[0]
       workgroups(%arg2 : !hal.buffer)[%c100]
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/device_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/device_ops.mlir
index ce88e3f..998d1b1 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/device_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/device_ops.mlir

@@ -2,88 +2,88 @@
 
 // CHECK-LABEL: @device_allocator
 // CHECK-SAME: (%[[DEVICE:.+]]: !vm.ref<!hal.device>)
-func.func @device_allocator(%device: !hal.device) -> !hal.allocator {
+util.func public @device_allocator(%device: !hal.device) -> !hal.allocator {
   // CHECK: %ref = vm.call @hal.device.allocator(%[[DEVICE]]) {nosideeffects} : (!vm.ref<!hal.device>) -> !vm.ref<!hal.allocator>
   %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
-  return %allocator : !hal.allocator
+  util.return %allocator : !hal.allocator
 }
 
 // -----
 
 // CHECK-LABEL: @device_query_i64
 // CHECK-SAME: (%[[DEVICE:.+]]: !vm.ref<!hal.device>)
-func.func @device_query_i64(%device: !hal.device) -> (i1, i64) {
+util.func public @device_query_i64(%device: !hal.device) -> (i1, i64) {
   // CHECK-DAG: %[[NS:.+]] = vm.rodata.inline "_utf8_sys_
   // CHECK-DAG: %[[KEY:.+]] = vm.rodata.inline "_utf8_foo_
   // CHECK: %[[RET:.+]]:2 = vm.call @hal.device.query.i64(%[[DEVICE]], %[[NS]], %[[KEY]]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
   %ok, %value = hal.device.query<%device : !hal.device> key("sys" :: "foo") : i1, i64
-  // CHECK: return %[[RET]]#0, %[[RET]]#1
-  return %ok, %value : i1, i64
+  // CHECK: vm.return %[[RET]]#0, %[[RET]]#1
+  util.return %ok, %value : i1, i64
 }
 
 // -----
 
 // CHECK-LABEL: @device_query_i64_default
 // CHECK-SAME: (%[[DEVICE:.+]]: !vm.ref<!hal.device>)
-func.func @device_query_i64_default(%device: !hal.device) -> i64 {
+util.func public @device_query_i64_default(%device: !hal.device) -> i64 {
   // CHECK-DAG: %[[NS:.+]] = vm.rodata.inline "_utf8_sys_
   // CHECK-DAG: %[[KEY:.+]] = vm.rodata.inline "_utf8_foo_
   // CHECK: %[[RET:.+]]:2 = vm.call @hal.device.query.i64(%[[DEVICE]], %[[NS]], %[[KEY]]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
   %ok, %value = hal.device.query<%device : !hal.device> key("sys" :: "foo") : i1, i64 = 123 : i64
   // CHECK: %[[OUT:.+]] = vm.select.i64 %[[RET]]#0, %[[RET]]#1, %c123 : i64
-  // CHECK: return %[[OUT]]
-  return %value : i64
+  // CHECK: vm.return %[[OUT]]
+  util.return %value : i64
 }
 
 // -----
 
 // CHECK-LABEL: @device_query_i32
 // CHECK-SAME: (%[[DEVICE:.+]]: !vm.ref<!hal.device>)
-func.func @device_query_i32(%device: !hal.device) -> (i1, i32) {
+util.func public @device_query_i32(%device: !hal.device) -> (i1, i32) {
   // CHECK-DAG: %[[NS:.+]] = vm.rodata.inline "_utf8_sys_
   // CHECK-DAG: %[[KEY:.+]] = vm.rodata.inline "_utf8_foo_
   // CHECK: %[[RET:.+]]:2 = vm.call @hal.device.query.i64(%[[DEVICE]], %[[NS]], %[[KEY]]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
   // CHECK: %[[RET_I32:.+]] = vm.trunc.i64.i32 %[[RET]]#1 : i64 -> i32
   %ok, %value = hal.device.query<%device : !hal.device> key("sys" :: "foo") : i1, i32
-  // CHECK: return %[[RET]]#0, %[[RET_I32]]
-  return %ok, %value : i1, i32
+  // CHECK: vm.return %[[RET]]#0, %[[RET_I32]]
+  util.return %ok, %value : i1, i32
 }
 
 // -----
 
 // CHECK-LABEL: @device_query_i32_default
 // CHECK-SAME: (%[[DEVICE:.+]]: !vm.ref<!hal.device>)
-func.func @device_query_i32_default(%device: !hal.device) -> i32 {
+util.func public @device_query_i32_default(%device: !hal.device) -> i32 {
   // CHECK-DAG: %[[NS:.+]] = vm.rodata.inline "_utf8_sys_
   // CHECK-DAG: %[[KEY:.+]] = vm.rodata.inline "_utf8_foo_
   // CHECK: %[[RET:.+]]:2 = vm.call @hal.device.query.i64(%[[DEVICE]], %[[NS]], %[[KEY]]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
   // CHECK: %[[RET_I32:.+]] = vm.trunc.i64.i32 %[[RET]]#1 : i64 -> i32
   %ok, %value = hal.device.query<%device : !hal.device> key("sys" :: "foo") : i1, i32 = 123 : i32
   // CHECK: %[[OUT:.+]] = vm.select.i32 %[[RET]]#0, %[[RET_I32]], %c123 : i32
-  // CHECK: return %[[OUT]]
-  return %value : i32
+  // CHECK: vm.return %[[OUT]]
+  util.return %value : i32
 }
 
 // -----
 
 // CHECK-LABEL: @device_query_i1
 // CHECK-SAME: (%[[DEVICE:.+]]: !vm.ref<!hal.device>)
-func.func @device_query_i1(%device: !hal.device) -> (i1, i1) {
+util.func public @device_query_i1(%device: !hal.device) -> (i1, i1) {
   // CHECK-DAG: %[[NS:.+]] = vm.rodata.inline "_utf8_sys_
   // CHECK-DAG: %[[KEY:.+]] = vm.rodata.inline "_utf8_foo_
   // CHECK: %[[RET:.+]]:2 = vm.call @hal.device.query.i64(%[[DEVICE]], %[[NS]], %[[KEY]]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
   // CHECK: %[[RET_I32:.+]] = vm.trunc.i64.i32 %[[RET]]#1 : i64 -> i32
   %ok, %value = hal.device.query<%device : !hal.device> key("sys" :: "foo") : i1, i1
   // CHECK: %[[I1:.+]] = vm.and.i32 %[[RET_I32]], %c1 : i32
-  // CHECK: return %[[RET]]#0, %[[I1]]
-  return %ok, %value : i1, i1
+  // CHECK: vm.return %[[RET]]#0, %[[I1]]
+  util.return %ok, %value : i1, i1
 }
 
 // -----
 
 // CHECK-LABEL: @device_query_i1_default
 // CHECK-SAME: (%[[DEVICE:.+]]: !vm.ref<!hal.device>)
-func.func @device_query_i1_default(%device: !hal.device) -> i1 {
+util.func public @device_query_i1_default(%device: !hal.device) -> i1 {
   // CHECK-DAG: %[[NS:.+]] = vm.rodata.inline "_utf8_sys_
   // CHECK-DAG: %[[KEY:.+]] = vm.rodata.inline "_utf8_foo_
   // CHECK: %[[RET:.+]]:2 = vm.call @hal.device.query.i64(%[[DEVICE]], %[[NS]], %[[KEY]]) {nosideeffects} : (!vm.ref<!hal.device>, !vm.buffer, !vm.buffer) -> (i32, i64)
@@ -91,14 +91,14 @@
   %ok, %value = hal.device.query<%device : !hal.device> key("sys" :: "foo") : i1, i1 = 1 : i1
   // CHECK: %[[I1:.+]] = vm.and.i32 %[[RET_I32]], %c1 : i32
   // CHECK: %[[OUT:.+]] = vm.select.i32 %[[RET]]#0, %[[I1]], %c1
-  // CHECK: return %[[OUT]]
-  return %value : i1
+  // CHECK: vm.return %[[OUT]]
+  util.return %value : i1
 }
 
 // -----
 
 // CHECK-LABEL: @device_queue_alloca
-func.func @device_queue_alloca(
+util.func public @device_queue_alloca(
     // CHECK-SAME: (%[[DEVICE:.+]]: !vm.ref<!hal.device>, %[[AFFINITY:.+]]: i64,
     %device: !hal.device, %affinity: i64,
     // CHECK-SAME:  %[[WAIT_FENCE:.+]]: !vm.ref<!hal.fence>, %[[SIGNAL_FENCE:.+]]: !vm.ref<!hal.fence>,
@@ -117,13 +117,13 @@
       pool(%c100_i64)
       type(DeviceLocal) usage(Transfer)
       : !hal.buffer{%size}
-  return %buffer : !hal.buffer
+  util.return %buffer : !hal.buffer
 }
 
 // -----
 
 // CHECK-LABEL: @device_queue_dealloca
-func.func @device_queue_dealloca(
+util.func public @device_queue_dealloca(
     // CHECK-SAME: (%[[DEVICE:.+]]: !vm.ref<!hal.device>, %[[AFFINITY:.+]]: i64,
     %device: !hal.device, %affinity: i64,
     // CHECK-SAME:  %[[WAIT_FENCE:.+]]: !vm.ref<!hal.fence>, %[[SIGNAL_FENCE:.+]]: !vm.ref<!hal.fence>,
@@ -138,13 +138,13 @@
       affinity(%affinity)
       wait(%wait_fence) signal(%signal_fence)
       buffer(%buffer : !hal.buffer)
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @device_queue_read
-func.func @device_queue_read(
+util.func public @device_queue_read(
     // CHECK-SAME: (%[[DEVICE:.+]]: !vm.ref<!hal.device>, %[[AFFINITY:.+]]: i64,
     %device: !hal.device, %affinity: i64,
     // CHECK-SAME:  %[[WAIT_FENCE:.+]]: !vm.ref<!hal.fence>, %[[SIGNAL_FENCE:.+]]: !vm.ref<!hal.fence>,
@@ -173,13 +173,13 @@
       target(%target_buffer : !hal.buffer)[%target_offset]
       length(%length)
       flags(0)
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @device_queue_execute
-func.func @device_queue_execute(
+util.func public @device_queue_execute(
     // CHECK-SAME: (%[[DEVICE:.+]]: !vm.ref<!hal.device>, %[[AFFINITY:.+]]: i64,
     %device: !hal.device, %affinity: i64,
     // CHECK-SAME:  %[[WAIT_FENCE:.+]]: !vm.ref<!hal.fence>, %[[SIGNAL_FENCE:.+]]: !vm.ref<!hal.fence>,
@@ -194,17 +194,17 @@
       affinity(%affinity)
       wait(%wait_fence) signal(%signal_fence)
       commands([%cmd0, %cmd1])
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @device_queue_flush
-func.func @device_queue_flush(
+util.func public @device_queue_flush(
     // CHECK-SAME: (%[[DEVICE:.+]]: !vm.ref<!hal.device>, %[[AFFINITY:.+]]: i64)
     %device: !hal.device, %affinity: i64) {
   // CHECK: vm.call @hal.device.queue.flush(%[[DEVICE]], %[[AFFINITY]])
   hal.device.queue.flush<%device : !hal.device>
       affinity(%affinity)
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/devices_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/devices_ops.mlir
index b8423ad..c3dbee6 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/devices_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/devices_ops.mlir

@@ -1,18 +1,18 @@
 // RUN: iree-opt --split-input-file --iree-convert-hal-to-vm --canonicalize --iree-vm-target-index-bits=32 %s | FileCheck %s
 
 // CHECK-LABEL: @devices_count
-func.func @devices_count() -> index {
+util.func public @devices_count() -> index {
   // CHECK: = vm.call @hal.devices.count() {nosideeffects} : () -> i32
   %device_count = hal.devices.count : index
-  return %device_count : index
+  util.return %device_count : index
 }
 
 // -----
 
 // CHECK-LABEL: @devices_get
 // CHECK-SAME: (%[[INDEX:.+]]: i32)
-func.func @devices_get(%index: index) -> !hal.device {
+util.func public @devices_get(%index: index) -> !hal.device {
   // CHECK: = vm.call @hal.devices.get(%[[INDEX]]) {nosideeffects} : (i32) -> !vm.ref<!hal.device>
   %device = hal.devices.get %index : !hal.device
-  return %device : !hal.device
+  util.return %device : !hal.device
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/executable_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/executable_ops.mlir
index b107b7b..e449189 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/executable_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/executable_ops.mlir

@@ -14,7 +14,7 @@
 }
 
 // CHECK-LABEL: @executableCreate
-func.func @executableCreate(
+util.func public @executableCreate(
     // CHECK-SAME: %[[DEV:.+]]: !vm.ref<!hal.device>
     %device: !hal.device,
     // CHECK-SAME: %[[LAYOUT0:.+]]: !vm.ref<!hal.pipeline_layout>,
@@ -40,7 +40,7 @@
   %1 = hal.executable.create device(%device : !hal.device) target(@exe::@binary2) layouts([%layout1, %layout0]) : !hal.executable
 
   // CHECK: vm.return %[[EXE1]], %[[EXE2]]
-  return %0, %1 : !hal.executable, !hal.executable
+  util.return %0, %1 : !hal.executable, !hal.executable
 }
 
 // -----
@@ -61,7 +61,7 @@
 }
 
 // CHECK-LABEL: @multipleExecutables
-func.func @multipleExecutables(
+util.func public @multipleExecutables(
     %device: !hal.device,
     %layout0: !hal.pipeline_layout,
     %layout1: !hal.pipeline_layout
@@ -72,7 +72,7 @@
   // CHECK-DAG: %[[FORMAT2:.+]] = vm.rodata.inline "_utf8_format_
   // CHECK-DAG: %[[BINARY2:.+]] = vm.const.ref.rodata @exe2_binary2 : !vm.buffer
   %1 = hal.executable.create device(%device : !hal.device) target(@exe2::@binary2) layouts([%layout1, %layout0]) : !hal.executable
-  return %0, %1 : !hal.executable, !hal.executable
+  util.return %0, %1 : !hal.executable, !hal.executable
 }
 
 // -----
@@ -86,7 +86,7 @@
 }
 
 // CHECK-LABEL: @executableConstants
-func.func @executableConstants(
+util.func public @executableConstants(
     // CHECK-SAME: %[[DEV:.+]]: !vm.ref<!hal.device>
     %device: !hal.device,
     // CHECK-SAME: %[[LAYOUT:.+]]: !vm.ref<!hal.pipeline_layout>
@@ -117,5 +117,5 @@
       constants([%constant0, %c0, %constant1]) : !hal.executable
 
   // CHECK: vm.return %[[EXE]]
-  return %0 : !hal.executable
+  util.return %0 : !hal.executable
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/fence_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/fence_ops.mlir
index 6eb570e..995ed77 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/fence_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/test/fence_ops.mlir

@@ -2,57 +2,57 @@
 
 // CHECK-LABEL: @fence_create
 // CHECK-SAME: (%[[DEVICE:.+]]: !vm.ref<!hal.device>)
-func.func @fence_create(%device: !hal.device) -> !hal.fence {
+util.func public @fence_create(%device: !hal.device) -> !hal.fence {
   // CHECK: %[[FLAGS:.+]] = vm.const.i32.zero
   // CHECK: %[[FENCE:.+]] = vm.call @hal.fence.create(%[[DEVICE]], %[[FLAGS]])
   %fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
   // CHECK: vm.return %[[FENCE]]
-  return %fence : !hal.fence
+  util.return %fence : !hal.fence
 }
 
 // -----
 
 // CHECK-LABEL: @fence_join
 // CHECK-SAME: (%[[FENCE0:.+]]: !vm.ref<!hal.fence>, %[[FENCE1:.+]]: !vm.ref<!hal.fence>)
-func.func @fence_join(%fence0: !hal.fence, %fence1: !hal.fence) -> !hal.fence {
+util.func public @fence_join(%fence0: !hal.fence, %fence1: !hal.fence) -> !hal.fence {
   // CHECK: %[[JOIN:.+]] = vm.call.variadic @hal.fence.join
   // CHECK-SAME: ([%[[FENCE0]], %[[FENCE1]]])
   %fence = hal.fence.join at([%fence0, %fence1]) -> !hal.fence
   // CHECK: vm.return %[[JOIN]]
-  return %fence : !hal.fence
+  util.return %fence : !hal.fence
 }
 
 // -----
 
 // CHECK-LABEL: @fence_query
 // CHECK-SAME: (%[[FENCE:.+]]: !vm.ref<!hal.fence>)
-func.func @fence_query(%fence: !hal.fence) -> i32 {
+util.func public @fence_query(%fence: !hal.fence) -> i32 {
   // CHECK: %[[STATUS:.+]] = vm.call @hal.fence.query(%[[FENCE]])
   %status = hal.fence.query<%fence : !hal.fence> : i32
   // CHECK: vm.return %[[STATUS]]
-  return %status : i32
+  util.return %status : i32
 }
 
 // -----
 
 // CHECK-LABEL: @fence_signal
 // CHECK-SAME: (%[[FENCE:.+]]: !vm.ref<!hal.fence>)
-func.func @fence_signal(%fence: !hal.fence) {
+util.func public @fence_signal(%fence: !hal.fence) {
   // CHECK: vm.call @hal.fence.signal(%[[FENCE]])
   hal.fence.signal<%fence : !hal.fence>
   // CHECK: vm.return
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @fence_fail
 // CHECK-SAME: (%[[FENCE:.+]]: !vm.ref<!hal.fence>, %[[STATUS:.+]]: i32)
-func.func @fence_fail(%fence: !hal.fence, %status: i32) {
+util.func public @fence_fail(%fence: !hal.fence, %status: i32) {
   // CHECK: vm.call @hal.fence.fail(%[[FENCE]], %[[STATUS]])
   hal.fence.fail<%fence : !hal.fence> status(%status)
   // CHECK: vm.return
-  return
+  util.return
 }
 
 // -----
@@ -60,10 +60,10 @@
 // CHECK-LABEL: @fence_await
 // CHECK-SAME: (%[[FENCE0:.+]]: !vm.ref<!hal.fence>, %[[FENCE1:.+]]: !vm.ref<!hal.fence>,
 // CHECK-SAME:  %[[TIMEOUT:.+]]: i32)
-func.func @fence_await(%fence0: !hal.fence, %fence1: !hal.fence, %timeout: i32) -> i32 {
+util.func public @fence_await(%fence0: !hal.fence, %fence1: !hal.fence, %timeout: i32) -> i32 {
   // CHECK: %[[STATUS:.+]] = vm.call.variadic @hal.fence.await
   // CHECK-SAME: (%[[TIMEOUT]], [%[[FENCE0]], %[[FENCE1]]])
   %status = hal.fence.await until([%fence0, %fence1]) timeout_millis(%timeout) : i32
   // CHECK: vm.return %[[STATUS]]
-  return %status : i32
+  util.return %status : i32
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StandardToHAL/test/shape_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StandardToHAL/test/shape_ops.mlir
index 42fc714..d0f59e1 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StandardToHAL/test/shape_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StandardToHAL/test/shape_ops.mlir

@@ -2,11 +2,11 @@
 
 // CHECK-LABEL: @tensorDim
 // CHECK-SAME: (%[[ARG0:.+]]: !hal.buffer_view)
-func.func @tensorDim(%arg0: tensor<4x?xf32>) -> index {
+util.func public @tensorDim(%arg0: tensor<4x?xf32>) -> index {
   %c1 = arith.constant 1 : index
   // CHECK: %[[DIM:.+]] = hal.buffer_view.dim<%[[ARG0]] : !hal.buffer_view>[1] : index
   %dim = tensor.dim %arg0, %c1 : tensor<4x?xf32>
-  return %dim : index
+  util.return %dim : index
 }
 
 // -----
@@ -16,10 +16,10 @@
 
 // CHECK: @tensorRank
 // CHECK-SAME: (%[[ARG0:.+]]: !hal.buffer_view)
-func.func @tensorRank(%arg0: tensor<4x?xf32>) -> index {
+util.func public @tensorRank(%arg0: tensor<4x?xf32>) -> index {
   // CHECK-NOT: hal.buffer_view.rank
   // CHECK: %[[RANK:.+]] = arith.constant 2
   %rank = tensor.rank %arg0 : tensor<4x?xf32>
-  // CHECK: return %[[RANK]]
-  return %rank : index
+  // CHECK: util.return %[[RANK]]
+  util.return %rank : index
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Patterns.cpp b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Patterns.cpp
index 16f9e3e..9a15c91 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Patterns.cpp
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Patterns.cpp

@@ -15,7 +15,6 @@
 #include "iree/compiler/Dialect/Stream/IR/StreamTypes.h"
 #include "iree/compiler/Dialect/Util/IR/UtilOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/Dominance.h"
 #include "mlir/Transforms/DialectConversion.h"
@@ -818,13 +817,14 @@
                                                 newResultTypes))) {
       return rewriter.notifyMatchFailure(funcOp, "failed to convert types");
     }
-    auto newOp = rewriter.replaceOpWithNewOp<func::FuncOp>(
-        funcOp, funcOp.getName(),
+    auto newOp = rewriter.replaceOpWithNewOp<IREE::Util::FuncOp>(
+        funcOp, funcOp.getNameAttr(),
         rewriter.getFunctionType(newArgTypes, newResultTypes),
-        funcOp.getSymVisibilityAttr(),
+        /*tied_operands=*/ArrayAttr{}, funcOp.getSymVisibilityAttr(),
         rewriter.getArrayAttr(
             ArrayRef<Attribute>(newArgAttrs.data(), newArgAttrs.size())),
-        funcOp.getAllResultAttrs());
+        funcOp.getAllResultAttrs(),
+        /*inlining_policy=*/IREE::Util::InliningPolicyAttrInterface{});
     newOp->setDialectAttrs(funcOp->getDialectAttrs());
     return success();
   }
@@ -867,8 +867,9 @@
       llvm::append_range(resultTypes, convertedTypes);
     }
 
-    rewriter.replaceOpWithNewOp<func::CallOp>(callOp, callOp.getCalleeAttr(),
-                                              resultTypes, operands);
+    rewriter.replaceOpWithNewOp<IREE::Util::CallOp>(
+        callOp, resultTypes, callOp.getCallee(), operands,
+        /*tied_operands=*/ArrayAttr{});
     return success();
   }
 };

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/channel_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/channel_ops.mlir
index d11abe8..3f88bd1 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/channel_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/channel_ops.mlir

@@ -2,7 +2,7 @@
 
 // CHECK-LABEL: @channel_create
 //  CHECK-SAME: () -> !hal.channel
-func.func @channel_create() -> !stream.channel {
+util.func public @channel_create() -> !stream.channel {
   // CHECK-DAG: %[[DEVICE:.+]] = hal.devices.get %{{.+}} : !hal.device
   // CHECK-DAG: %[[AFFINITY:.+]] = arith.constant 3
   // CHECK-DAG: %[[ID:.+]] = util.null : !util.buffer
@@ -10,15 +10,15 @@
   // CHECK-DAG: %[[DEFAULT:.+]] = arith.constant -1
   // CHECK: %[[CHANNEL:.+]] = hal.channel.create device(%[[DEVICE]] : !hal.device) affinity(%[[AFFINITY]]) flags(0) id(%[[ID]]) group(%[[GROUP]]) rank(%[[DEFAULT]]) count(%[[DEFAULT]]) : !hal.channel
   %channel = stream.channel.create on(#hal.affinity.queue<[0, 1]>) group("group") : !stream.channel
-  // CHECK: return %[[CHANNEL]]
-  return %channel : !stream.channel
+  // CHECK: util.return %[[CHANNEL]]
+  util.return %channel : !stream.channel
 }
 
 // -----
 
 // CHECK-LABEL: @channel_split
 //  CHECK-SAME: (%[[BASE_CHANNEL:.+]]: !hal.channel)
-func.func @channel_split(%base_channel: !stream.channel) {
+util.func public @channel_split(%base_channel: !stream.channel) {
   // CHECK-DAG: %[[COLOR_INDEX:.+]] = arith.constant 100
   %color = arith.constant 100 : index
   // CHECK-DAG: %[[KEY_INDEX:.+]] = arith.constant 101
@@ -27,29 +27,29 @@
   // CHECK-DAG: %[[KEY_I32:.+]] = arith.index_cast %[[KEY_INDEX]] : index to i32
   // CHECK: %channel = hal.channel.split<%[[BASE_CHANNEL]] : !hal.channel> color(%[[COLOR_I32]]) key(%[[KEY_I32]]) flags(0) : !hal.channel
   %split_channel = stream.channel.split %base_channel, %color, %key : !stream.channel -> !stream.channel
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @channel_rank
 //  CHECK-SAME: (%[[CHANNEL:.+]]: !hal.channel)
-func.func @channel_rank(%channel: !stream.channel) -> index {
+util.func public @channel_rank(%channel: !stream.channel) -> index {
   // CHECK: %[[RANK_I32:.+]], %[[COUNT_I32:.+]] = hal.channel.rank_and_count<%[[CHANNEL]] : !hal.channel> : i32, i32
   // CHECK: %[[RANK:.+]] = arith.index_cast %[[RANK_I32]] : i32 to index
   %rank = stream.channel.rank %channel : index
-  // CHECK: return %[[RANK]]
-  return %rank : index
+  // CHECK: util.return %[[RANK]]
+  util.return %rank : index
 }
 
 // -----
 
 // CHECK-LABEL: @channel_count
 //  CHECK-SAME: (%[[CHANNEL:.+]]: !hal.channel) -> index
-func.func @channel_count(%channel: !stream.channel) -> index {
+util.func public @channel_count(%channel: !stream.channel) -> index {
   // CHECK: %[[RANK_I32:.+]], %[[COUNT_I32:.+]] = hal.channel.rank_and_count<%[[CHANNEL]] : !hal.channel> : i32, i32
   // CHECK: %[[COUNT:.+]] = arith.index_cast %[[COUNT_I32]] : i32 to index
   %count = stream.channel.count %channel : index
-  // CHECK: return %[[COUNT]]
-  return %count : index
+  // CHECK: util.return %[[COUNT]]
+  util.return %count : index
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/cmd_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/cmd_ops.mlir
index d9e6873..e0db957 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/cmd_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/cmd_ops.mlir

@@ -4,7 +4,7 @@
 // the normal sequential execution barriers.
 
 // CHECK-LABEL: @cmdMemoryControl
-func.func @cmdMemoryControl(%arg0: !stream.resource<transient>, %arg1: index) -> !stream.timepoint {
+util.func public @cmdMemoryControl(%arg0: !stream.resource<transient>, %arg1: index) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   // CHECK: %[[CMD:.+]] = hal.command_buffer.create
@@ -17,13 +17,13 @@
     stream.cmd.discard %arg2[%c0 for %c128] : !stream.resource<transient>{%arg1}
   } => !stream.timepoint
   // CHECK-NEXT: hal.command_buffer.finalize<%[[CMD]]
-  return %0 : !stream.timepoint
+  util.return %0 : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @cmdFill
-func.func @cmdFill(%arg0: !stream.resource<transient>, %arg1: index) -> !stream.timepoint {
+util.func public @cmdFill(%arg0: !stream.resource<transient>, %arg1: index) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   %c255_i32 = arith.constant 255 : i32
@@ -36,13 +36,13 @@
     // CHECK-NEXT: hal.command_buffer.execution_barrier<%[[CMD]]
   } => !stream.timepoint
   // CHECK-NEXT: hal.command_buffer.finalize<%[[CMD]]
-  return %0 : !stream.timepoint
+  util.return %0 : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @cmdCopy
-func.func @cmdCopy(%arg0: !stream.resource<transient>, %arg1: index, %arg2: !stream.resource<staging>, %arg3: index) -> !stream.timepoint {
+util.func public @cmdCopy(%arg0: !stream.resource<transient>, %arg1: index, %arg2: !stream.resource<staging>, %arg3: index) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   // CHECK: %[[CMD:.+]] = hal.command_buffer.create
@@ -55,13 +55,13 @@
     // CHECK-NEXT: hal.command_buffer.execution_barrier<%[[CMD]]
   } => !stream.timepoint
   // CHECK-NEXT: hal.command_buffer.finalize<%[[CMD]]
-  return %0 : !stream.timepoint
+  util.return %0 : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @cmdCollective
-func.func @cmdCollective(%arg0: !stream.resource<transient>, %arg1: index, %arg2: !stream.resource<transient>, %arg3: index, %arg4: !stream.channel) -> !stream.timepoint {
+util.func public @cmdCollective(%arg0: !stream.resource<transient>, %arg1: index, %arg2: !stream.resource<transient>, %arg3: index, %arg4: !stream.channel) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   // CHECK: %[[CMD:.+]] = hal.command_buffer.create
@@ -117,7 +117,7 @@
 
   } => !stream.timepoint
   // CHECK-NEXT: hal.command_buffer.finalize<%[[CMD]]
-  return %0 : !stream.timepoint
+  util.return %0 : !stream.timepoint
 }
 
 // -----
@@ -128,7 +128,7 @@
 // to.
 
 // CHECK-LABEL: @cmdExecute
-func.func @cmdExecute(%arg0: !stream.resource<transient>, %arg1: index, %arg2: !stream.resource<staging>, %arg3: index, %arg4: !stream.timepoint) -> !stream.timepoint {
+util.func public @cmdExecute(%arg0: !stream.resource<transient>, %arg1: index, %arg2: !stream.resource<staging>, %arg3: index, %arg4: !stream.timepoint) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   // CHECK: %[[CMD:.+]] = hal.command_buffer.create
@@ -158,8 +158,8 @@
   // CHECK-SAME: wait(%arg4)
   // CHECK-SAME: signal(%[[SIGNAL_FENCE]])
   // CHECK-SAME: commands([%[[CMD]]])
-  // CHECK: return %[[SIGNAL_FENCE]]
-  return %0 : !stream.timepoint
+  // CHECK: util.return %[[SIGNAL_FENCE]]
+  util.return %0 : !stream.timepoint
 }
 
 // -----
@@ -211,7 +211,7 @@
 }
 
 // CHECK-LABEL: @cmdDispatch
-func.func @cmdDispatch(%arg0: !stream.resource<transient>, %arg1: index, %arg2: !stream.resource<external>, %arg3: index) -> !stream.timepoint {
+util.func public @cmdDispatch(%arg0: !stream.resource<transient>, %arg1: index, %arg2: !stream.resource<external>, %arg3: index) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
@@ -277,7 +277,7 @@
     // CHECK: hal.command_buffer.execution_barrier<%[[CMD]]
   } => !stream.timepoint
   // CHECK-NEXT: hal.command_buffer.finalize<%[[CMD]]
-  return %0 : !stream.timepoint
+  util.return %0 : !stream.timepoint
 }
 
 // -----
@@ -285,11 +285,11 @@
 // Tests conversion of streamable calls and function declarations.
 // Expect a command buffer and a buffer + offset + length for each resource.
 
-// CHECK: func.func private @cmdFunc(!hal.command_buffer, !hal.buffer, index, index, i32, !hal.buffer, index, index, !custom.type, !hal.buffer, index, index)
+// CHECK: util.func private @cmdFunc(%arg0: !hal.command_buffer, %arg1: !hal.buffer, %arg2: index, %arg3: index, %arg4: i32, %arg5: !hal.buffer, %arg6: index, %arg7: index, %arg8: !custom.type, %arg9: !hal.buffer, %arg10: index, %arg11: index)
 stream.cmd.func private @cmdFunc(%arg0[%arg1 for %arg2]: !stream.resource<*>, %arg3: i32, %arg4[%arg5 for %arg6]: !stream.resource<*>, %arg7: !custom.type, %arg8[%arg9 for %arg10]: !stream.resource<*>)
 
 // CHECK-LABEL: @cmdCall
-func.func @cmdCall(%arg0: !stream.resource<external>, %arg1: i32, %arg2: !stream.resource<external>, %arg3: !custom.type, %arg4: !stream.resource<external>) -> !stream.timepoint {
+util.func public @cmdCall(%arg0: !stream.resource<external>, %arg1: i32, %arg2: !stream.resource<external>, %arg3: !custom.type, %arg4: !stream.resource<external>) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   // CHECK-DAG: %[[SIZE0:.+]] = arith.constant 100
   %size0 = arith.constant 100 : index
@@ -299,11 +299,11 @@
   %size2 = arith.constant 102 : index
   // CHECK: %[[COMMAND_BUFFER:.+]] = hal.command_buffer.create
   %timepoint = stream.cmd.execute with(%arg0 as %stream0: !stream.resource<external>{%size0}, %arg2 as %stream1: !stream.resource<external>{%size1}, %arg4 as %stream2: !stream.resource<external>{%size2}) {
-    // CHECK: call @cmdFunc(%[[COMMAND_BUFFER]], %arg0, %c0, %[[SIZE0]], %arg1, %arg2, %c0, %[[SIZE1]], %arg3, %arg4, %c0, %[[SIZE2]]) :
+    // CHECK: util.call @cmdFunc(%[[COMMAND_BUFFER]], %arg0, %c0, %[[SIZE0]], %arg1, %arg2, %c0, %[[SIZE1]], %arg3, %arg4, %c0, %[[SIZE2]]) :
     // CHECK-SAME: (!hal.command_buffer, !hal.buffer, index, index, i32, !hal.buffer, index, index, !custom.type, !hal.buffer, index, index) -> ()
     stream.cmd.call @cmdFunc(ro %stream0[%c0 for %size0], %arg1, rw %stream1[%c0 for %size1], %arg3, wo %stream2[%c0 for %size2]) : (!stream.resource<external>{%size0}, i32, !stream.resource<external>{%size1}, !custom.type, !stream.resource<external>{%size2}) -> ()
   } => !stream.timepoint
-  return %timepoint : !stream.timepoint
+  util.return %timepoint : !stream.timepoint
 }
 
 // -----
@@ -313,7 +313,7 @@
 // the target affinities (0b01 | 0b10 = 0b11 = 3).
 
 // CHECK-LABEL: @cmdExecuteAffinities
-func.func @cmdExecuteAffinities(%arg0: !stream.resource<transient>, %arg1: index, %arg2: !stream.resource<staging>, %arg3: index, %arg4: !stream.timepoint) -> !stream.timepoint {
+util.func public @cmdExecuteAffinities(%arg0: !stream.resource<transient>, %arg1: index, %arg2: !stream.resource<staging>, %arg3: index, %arg4: !stream.timepoint) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   // CHECK: %[[CMD:.+]] = hal.command_buffer.create
@@ -323,5 +323,5 @@
   // CHECK: hal.device.queue.execute
   // CHECK-SAME: affinity(%c3_i64)
   // CHECK-SAME: commands([%[[CMD]]])
-  return %0 : !stream.timepoint
+  util.return %0 : !stream.timepoint
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/context_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/context_ops.mlir
index dd43362..5d73951 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/context_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/context_ops.mlir

@@ -1,42 +1,42 @@
 // RUN: iree-opt --split-input-file --allow-unregistered-dialect --iree-hal-conversion %s | FileCheck %s
 
 // CHECK-LABEL: @contextResolveAllocator
-func.func @contextResolveAllocator() -> !hal.allocator {
+util.func public @contextResolveAllocator() -> !hal.allocator {
   // CHECK: %[[DEVICE:.+]] = hal.devices.get %{{.+}}
   // CHECK: %[[ALLOCATOR:.+]] = hal.device.allocator<%[[DEVICE]] : !hal.device> : !hal.allocator
   %allocator = stream.context.resolve : !hal.allocator
-  // CHECK: return %[[ALLOCATOR]]
-  return %allocator : !hal.allocator
+  // CHECK: util.return %[[ALLOCATOR]]
+  util.return %allocator : !hal.allocator
 }
 
 // -----
 
 // CHECK-LABEL: @contextResolveDevice
-func.func @contextResolveDevice() -> !hal.device {
+util.func public @contextResolveDevice() -> !hal.device {
   // CHECK: %[[DEVICE:.+]] = hal.devices.get %{{.+}}
   %device = stream.context.resolve : !hal.device
-  // CHECK: return %[[DEVICE]]
-  return %device : !hal.device
+  // CHECK: util.return %[[DEVICE]]
+  util.return %device : !hal.device
 }
 
 // -----
 
 // CHECK-LABEL: @contextResolveDeviceQueueAffinityAny
-func.func @contextResolveDeviceQueueAffinityAny() -> (!hal.device, i64) {
+util.func public @contextResolveDeviceQueueAffinityAny() -> (!hal.device, i64) {
   // CHECK-DAG: %[[DEVICE:.+]] = hal.devices.get %{{.+}}
   // CHECK-DAG: %[[QUEUE_AFFINITY:.+]] = arith.constant -1 : i64
   %device, %queue_affinity_any = stream.context.resolve on(#hal.affinity.queue<*>) : !hal.device, i64
-  // CHECK: return %[[DEVICE]], %[[QUEUE_AFFINITY]]
-  return %device, %queue_affinity_any : !hal.device, i64
+  // CHECK: util.return %[[DEVICE]], %[[QUEUE_AFFINITY]]
+  util.return %device, %queue_affinity_any : !hal.device, i64
 }
 
 // -----
 
 // CHECK-LABEL: @contextResolveDeviceQueueAffinity45
-func.func @contextResolveDeviceQueueAffinity45() -> (!hal.device, i64) {
+util.func public @contextResolveDeviceQueueAffinity45() -> (!hal.device, i64) {
   // CHECK: %[[DEVICE:.+]] = hal.devices.get %{{.+}}
   // CHECK-DAG: %[[QUEUE_AFFINITY:.+]] = arith.constant 48 : i64
   %device, %queue_affinity_45 = stream.context.resolve on(#hal.affinity.queue<[4, 5]>) : !hal.device, i64
-  // CHECK: return %[[DEVICE]], %[[QUEUE_AFFINITY]]
-  return %device, %queue_affinity_45 : !hal.device, i64
+  // CHECK: util.return %[[DEVICE]], %[[QUEUE_AFFINITY]]
+  util.return %device, %queue_affinity_45 : !hal.device, i64
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/debug_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/debug_ops.mlir
index a748326..3e3e984 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/debug_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/debug_ops.mlir

@@ -2,7 +2,7 @@
 
 // CHECK-LABEL: @tensorTrace
 // CHECK-SAME: (%[[TENSOR0_BUFFER:.+]]: !hal.buffer, %[[TENSOR0_SIZE:.+]]: index, %[[TENSOR1_BUFFER:.+]]: !hal.buffer, %[[TENSOR1_SIZE:.+]]: index, %[[TENSOR1_DIM0:.+]]: index)
-func.func @tensorTrace(%tensor0: !stream.resource<staging>, %tensor0_size: index, %tensor1: !stream.resource<staging>, %tensor1_size: index, %tensor1_dim0: index) {
+util.func public @tensorTrace(%tensor0: !stream.resource<staging>, %tensor0_size: index, %tensor1: !stream.resource<staging>, %tensor1_size: index, %tensor1_dim0: index) {
   // CHECK-DAG: %[[TENSOR0:.+]] = hal.buffer_view.create buffer(%[[TENSOR0_BUFFER]] : !hal.buffer)[%c0{{.*}}, %[[TENSOR0_SIZE]]] shape([%c5, %c3])
   // CHECK-DAG: %[[TENSOR1:.+]] =  hal.buffer_view.create buffer(%[[TENSOR1_BUFFER]] : !hal.buffer)[%c0{{.*}}, %[[TENSOR1_SIZE]]] shape([%[[TENSOR1_DIM0]], %c5{{.*}}])
   // CHECK: hal.buffer_view.trace "FOOBAR" = %[[TENSOR0]], %[[TENSOR1]] : !hal.buffer_view, !hal.buffer_view
@@ -10,5 +10,5 @@
     %tensor0 : tensor<5x3xf32> in !stream.resource<staging>{%tensor0_size},
     %tensor1 : tensor<?x5xf32>{%tensor1_dim0} in !stream.resource<staging>{%tensor1_size}
   ]
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/file_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/file_ops.mlir
index 9473df7..1182ee4 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/file_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/file_ops.mlir

@@ -2,20 +2,20 @@
 
 // CHECK-LABEL: @file_constant
 //  CHECK-SAME: (%[[BUFFER:.+]]: !util.buffer)
-func.func @file_constant(%buffer: !util.buffer) {
+util.func public @file_constant(%buffer: !util.buffer) {
   %c0 = arith.constant 0 : index
   %c1088 = arith.constant 1088 : index
   // CHECK: %[[DEVICE:.+]] = hal.devices.get %{{.+}}
   // CHECK: = hal.ex.file.from_memory device(%[[DEVICE]] : !hal.device) affinity(%c-1_i64) access(Read) buffer(%[[BUFFER]] : !util.buffer)[%c0 for %c1088] flags(%c0_i32) : !hal.file
   %file = stream.file.constant %buffer[%c0 for %c1088] : !util.buffer{%c1088} -> !stream.file
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @file_read
 //  CHECK-SAME: (%[[WAIT:.+]]: !hal.fence, %[[FILE:.+]]: !hal.file, %[[RESOURCE:.+]]: !hal.buffer)
-func.func @file_read(%wait: !stream.timepoint, %file: !stream.file, %resource: !stream.resource<variable>) -> !stream.timepoint {
+util.func public @file_read(%wait: !stream.timepoint, %file: !stream.file, %resource: !stream.resource<variable>) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %c0_i64 = arith.constant 0 : i64
   %c1088 = arith.constant 1088 : index
@@ -23,15 +23,15 @@
   // CHECK: %[[SIGNAL:.+]] = hal.fence.create
   // CHECK: hal.device.queue.read<%[[DEVICE]] : !hal.device> affinity(%c-1_i64) wait(%[[WAIT]]) signal(%[[SIGNAL]]) source(%[[FILE]] : !hal.file)[%c0_i64] target(%[[RESOURCE]] : !hal.buffer)[%c0] length(%c1088) flags(0)
   %signal = stream.file.read await(%wait) => %file[%c0_i64], %resource[%c0], %c1088 : !stream.file -> !stream.resource<variable>{%c1088} => !stream.timepoint
-  // CHECK: return %[[SIGNAL]]
-  return %signal : !stream.timepoint
+  // CHECK: util.return %[[SIGNAL]]
+  util.return %signal : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @file_write
 //  CHECK-SAME: (%[[WAIT:.+]]: !hal.fence, %[[FILE:.+]]: !hal.file, %[[RESOURCE:.+]]: !hal.buffer)
-func.func @file_write(%wait: !stream.timepoint, %file: !stream.file, %resource: !stream.resource<variable>) -> !stream.timepoint {
+util.func public @file_write(%wait: !stream.timepoint, %file: !stream.file, %resource: !stream.resource<variable>) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %c0_i64 = arith.constant 0 : i64
   %c1088 = arith.constant 1088 : index
@@ -39,6 +39,6 @@
   // CHECK: %[[SIGNAL:.+]] = hal.fence.create
   // CHECK: hal.device.queue.write<%[[DEVICE]] : !hal.device> affinity(%c-1_i64) wait(%[[WAIT]]) signal(%[[SIGNAL]]) source(%[[RESOURCE]] : !hal.buffer)[%c0] target(%[[FILE]] : !hal.file)[%c0_i64] length(%c1088) flags(0)
   %signal = stream.file.write await(%wait) => %resource[%c0], %file[%c0_i64], %c1088 : !stream.resource<variable>{%c1088} -> !stream.file => !stream.timepoint
-  // CHECK: return %[[SIGNAL]]
-  return %signal : !stream.timepoint
+  // CHECK: util.return %[[SIGNAL]]
+  util.return %signal : !stream.timepoint
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/resource_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/resource_ops.mlir
index 88cb014..6af93ee 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/resource_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/resource_ops.mlir

@@ -1,21 +1,21 @@
 // RUN: iree-opt --split-input-file --iree-hal-conversion %s | FileCheck %s
 
 // CHECK-LABEL: @resourceAlloc
-func.func @resourceAlloc(%arg0: index) -> !stream.resource<transient> {
+util.func public @resourceAlloc(%arg0: index) -> !stream.resource<transient> {
   // CHECK: %[[RET0:.+]] = hal.allocator.allocate
   // CHECK-SAME: type("DeviceVisible|DeviceLocal")
   // CHECK-SAME: usage("{{.+}}Transfer{{.+}}Dispatch{{.+}}")
   // CHECK-SAME: : !hal.buffer{%arg0}
   %0 = stream.resource.alloc uninitialized : !stream.resource<transient>{%arg0}
-  // CHECK: return %[[RET0]]
-  return %0 : !stream.resource<transient>
+  // CHECK: util.return %[[RET0]]
+  util.return %0 : !stream.resource<transient>
 }
 
 // -----
 
 // CHECK-LABEL: @resourceAlloca
 // CHECK-SAME: (%[[SIZE:.+]]: index)
-func.func @resourceAlloca(%size: index) -> (!stream.resource<transient>, !stream.timepoint) {
+util.func public @resourceAlloca(%size: index) -> (!stream.resource<transient>, !stream.timepoint) {
   // CHECK: %[[WAIT_FENCE:.+]] = util.null : !hal.fence
   // CHECK: %[[SIGNAL_FENCE:.+]] = hal.fence.create
   // CHECK: %[[RET0:.+]] = hal.device.queue.alloca
@@ -27,15 +27,15 @@
   // CHECK-SAME: usage("{{.+}}Transfer{{.+}}Dispatch{{.+}}")
   // CHECK-SAME: : !hal.buffer{%[[SIZE]]}
   %0:2 = stream.resource.alloca uninitialized : !stream.resource<transient>{%size} => !stream.timepoint
-  // CHECK: return %[[RET0]], %[[SIGNAL_FENCE]]
-  return %0#0, %0#1 : !stream.resource<transient>, !stream.timepoint
+  // CHECK: util.return %[[RET0]], %[[SIGNAL_FENCE]]
+  util.return %0#0, %0#1 : !stream.resource<transient>, !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @resourceAllocaAwait
 // CHECK-SAME: (%[[SIZE:.+]]: index, %[[WAIT_FENCE:.+]]: !hal.fence)
-func.func @resourceAllocaAwait(%size: index, %await_timepoint: !stream.timepoint) -> (!stream.resource<transient>, !stream.timepoint) {
+util.func public @resourceAllocaAwait(%size: index, %await_timepoint: !stream.timepoint) -> (!stream.resource<transient>, !stream.timepoint) {
   // CHECK: %[[SIGNAL_FENCE:.+]] = hal.fence.create
   // CHECK: %[[RET0:.+]] = hal.device.queue.alloca
   // CHECK-SAME: affinity(%c-1
@@ -46,15 +46,15 @@
   // CHECK-SAME: usage("{{.+}}Transfer{{.+}}Dispatch{{.+}}")
   // CHECK-SAME: : !hal.buffer{%[[SIZE]]}
   %0:2 = stream.resource.alloca uninitialized await(%await_timepoint) => !stream.resource<transient>{%size} => !stream.timepoint
-  // CHECK: return %[[RET0]], %[[SIGNAL_FENCE]]
-  return %0#0, %0#1 : !stream.resource<transient>, !stream.timepoint
+  // CHECK: util.return %[[RET0]], %[[SIGNAL_FENCE]]
+  util.return %0#0, %0#1 : !stream.resource<transient>, !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @resourceDealloca
 // CHECK-SAME: (%[[SIZE:.+]]: index, %[[RESOURCE:.+]]: !hal.buffer)
-func.func @resourceDealloca(%size: index, %resource: !stream.resource<transient>) -> !stream.timepoint {
+util.func public @resourceDealloca(%size: index, %resource: !stream.resource<transient>) -> !stream.timepoint {
   // CHECK: %[[WAIT_FENCE:.+]] = util.null : !hal.fence
   // CHECK: %[[SIGNAL_FENCE:.+]] = hal.fence.create
   // CHECK: hal.device.queue.dealloca
@@ -63,8 +63,8 @@
   // CHECK-SAME: signal(%[[SIGNAL_FENCE]])
   // CHECK-SAME: buffer(%[[RESOURCE]] : !hal.buffer)
   %0 = stream.resource.dealloca %resource : !stream.resource<transient>{%size} => !stream.timepoint
-  // CHECK: return %[[SIGNAL_FENCE]]
-  return %0 : !stream.timepoint
+  // CHECK: util.return %[[SIGNAL_FENCE]]
+  util.return %0 : !stream.timepoint
 }
 
 // -----
@@ -73,7 +73,7 @@
 
 // CHECK-LABEL: @resourceDeallocaAwait
 // CHECK-SAME: (%[[SIZE:.+]]: index, %[[RESOURCE:.+]]: !hal.buffer, %[[WAIT_FENCE:.+]]: !hal.fence)
-func.func @resourceDeallocaAwait(%size: index, %resource: !stream.resource<transient>, %await_timepoint: !stream.timepoint) -> !stream.timepoint {
+util.func public @resourceDeallocaAwait(%size: index, %resource: !stream.resource<transient>, %await_timepoint: !stream.timepoint) -> !stream.timepoint {
   // CHECK: %[[SIGNAL_FENCE:.+]] = hal.fence.create
   // CHECK: hal.device.queue.dealloca
   // CHECK-SAME: affinity(%c-1
@@ -81,24 +81,24 @@
   // CHECK-SAME: signal(%[[SIGNAL_FENCE]])
   // CHECK-SAME: buffer(%[[RESOURCE]] : !hal.buffer)
   %0 = stream.resource.dealloca await(%await_timepoint) => %resource : !stream.resource<transient>{%size} => !stream.timepoint
-  // CHECK: return %[[SIGNAL_FENCE]]
-  return %0 : !stream.timepoint
+  // CHECK: util.return %[[SIGNAL_FENCE]]
+  util.return %0 : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @resourceSize
-func.func @resourceSize(%arg0: !stream.resource<transient>) -> index {
+util.func public @resourceSize(%arg0: !stream.resource<transient>) -> index {
   // CHECK: %[[SIZE:.+]] = hal.buffer.length<%arg0 : !hal.buffer> : index
   %0 = stream.resource.size %arg0 : !stream.resource<transient>
-  // CHECK: return %[[SIZE]]
-  return %0 : index
+  // CHECK: util.return %[[SIZE]]
+  util.return %0 : index
 }
 
 // -----
 
 // CHECK-LABEL: @resourceTryMap
-func.func @resourceTryMap(%arg0: !util.buffer) -> (i1, !stream.resource<constant>) {
+util.func public @resourceTryMap(%arg0: !util.buffer) -> (i1, !stream.resource<constant>) {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   // CHECK: %[[DID_IMPORT:.+]], %[[IMPORTED:.+]] = hal.allocator.import
@@ -106,40 +106,40 @@
   // CHECK-SAME: type("DeviceVisible|DeviceLocal")
   // CHECK-SAME: usage("{{.+}}Transfer{{.+}}Dispatch{{.+}}SharingImmutable") : i1, !hal.
   %did_map, %mapping = stream.resource.try_map %arg0[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128}
-  // CHECK: return %[[DID_IMPORT]], %[[IMPORTED]]
-  return %did_map, %mapping : i1, !stream.resource<constant>
+  // CHECK: util.return %[[DID_IMPORT]], %[[IMPORTED]]
+  util.return %did_map, %mapping : i1, !stream.resource<constant>
 }
 
 // -----
 
 // CHECK-LABEL: @resourceLoad
-func.func @resourceLoad(%arg0: !stream.resource<staging>, %arg1: index) -> i32 {
+util.func public @resourceLoad(%arg0: !stream.resource<staging>, %arg1: index) -> i32 {
   %c4 = arith.constant 4 : index
   // CHECK: %[[RET0:.+]] = hal.buffer.load<%arg0 : !hal.buffer>[%c4] : i32
   %0 = stream.resource.load %arg0[%c4] : !stream.resource<staging>{%arg1} -> i32
-  // CHECK: return %[[RET0]]
-  return %0 : i32
+  // CHECK: util.return %[[RET0]]
+  util.return %0 : i32
 }
 
 // -----
 
 // CHECK-LABEL: @resourceStore
-func.func @resourceStore(%arg0: !stream.resource<staging>, %arg1: index) {
+util.func public @resourceStore(%arg0: !stream.resource<staging>, %arg1: index) {
   %c4 = arith.constant 4 : index
   %c123_i32 = arith.constant 123 : i32
   // CHECK: hal.buffer.store<%arg0 : !hal.buffer>[%c4] value(%c123_i32 : i32)
   stream.resource.store %c123_i32, %arg0[%c4] : i32 -> !stream.resource<staging>{%arg1}
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @resourceSubview
-func.func @resourceSubview(%arg0: !stream.resource<transient>, %arg1: index) -> !stream.resource<transient> {
+util.func public @resourceSubview(%arg0: !stream.resource<transient>, %arg1: index) -> !stream.resource<transient> {
   %c128 = arith.constant 128 : index
   %c256 = arith.constant 256 : index
   // CHECK: %[[RET0:.+]] = hal.buffer.subspan<%arg0 : !hal.buffer>[%c128, %c256] : !hal.buffer
   %0 = stream.resource.subview %arg0[%c128] : !stream.resource<transient>{%arg1} -> !stream.resource<transient>{%c256}
-  // CHECK: return %[[RET0]]
-  return %0 : !stream.resource<transient>
+  // CHECK: util.return %[[RET0]]
+  util.return %0 : !stream.resource<transient>
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/timepoint_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/timepoint_ops.mlir
index cca49b1..8a7b691 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/timepoint_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/timepoint_ops.mlir

@@ -2,86 +2,86 @@
 
 // CHECK-LABEL: util.global private mutable @rwTimepoint : !hal.fence
 util.global private mutable @rwTimepoint = #stream.timepoint<immediate>
-// CHECK: func.func @globalTimepoint(%arg0: !hal.fence) -> !hal.fence
-func.func @globalTimepoint(%arg0: !stream.timepoint) -> !stream.timepoint {
+// CHECK: util.func public @globalTimepoint(%arg0: !hal.fence) -> !hal.fence
+util.func public @globalTimepoint(%arg0: !stream.timepoint) -> !stream.timepoint {
   // CHECK: util.global.store %arg0, @rwTimepoint
   util.global.store %arg0, @rwTimepoint : !stream.timepoint
   // CHECK: %[[VALUE:.+]] = util.global.load @rwTimepoint
   %value = util.global.load @rwTimepoint : !stream.timepoint
-  // CHECK: return %[[VALUE]]
-  return %value : !stream.timepoint
+  // CHECK: util.return %[[VALUE]]
+  util.return %value : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @timepointImmediate
-func.func @timepointImmediate() -> !stream.timepoint {
+util.func public @timepointImmediate() -> !stream.timepoint {
   // CHECK: %[[FENCE:.+]] = util.null : !hal.fence
   %0 = stream.timepoint.immediate => !stream.timepoint
-  // CHECK: return %[[FENCE]]
-  return %0 : !stream.timepoint
+  // CHECK: util.return %[[FENCE]]
+  util.return %0 : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @timepointImportFence
-func.func @timepointImportFence(%arg0: !hal.fence) -> !stream.timepoint {
+util.func public @timepointImportFence(%arg0: !hal.fence) -> !stream.timepoint {
   %0 = stream.timepoint.import %arg0 : (!hal.fence) => !stream.timepoint
-  // CHECK: return %arg0
-  return %0 : !stream.timepoint
+  // CHECK: util.return %arg0
+  util.return %0 : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @timepointExportFence
-func.func @timepointExportFence(%arg0: !stream.timepoint) -> !hal.fence {
+util.func public @timepointExportFence(%arg0: !stream.timepoint) -> !hal.fence {
   %0 = stream.timepoint.export %arg0 => (!hal.fence)
-  // CHECK: return %arg0
-  return %0 : !hal.fence
+  // CHECK: util.return %arg0
+  util.return %0 : !hal.fence
 }
 
 // -----
 
 // CHECK-LABEL: @timepointChainExternal
 //  CHECK-SAME: (%[[TIMEPOINT:.+]]: !hal.fence, %[[SIGNAL:.+]]: !hal.fence)
-func.func @timepointChainExternal(%timepoint: !stream.timepoint, %signal: !hal.fence) {
+util.func public @timepointChainExternal(%timepoint: !stream.timepoint, %signal: !hal.fence) {
   // CHECK: %[[DEVICE:.+]] = hal.devices.get %{{.+}}
   // CHECK: hal.device.queue.execute<%[[DEVICE]] : !hal.device> affinity(%c-1_i64) wait(%[[TIMEPOINT]]) signal(%[[SIGNAL]])
   stream.timepoint.chain_external %timepoint => (%signal : !hal.fence)
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @timepointJoin
-func.func @timepointJoin(%arg0: !stream.timepoint, %arg1: !stream.timepoint) -> !stream.timepoint {
+util.func public @timepointJoin(%arg0: !stream.timepoint, %arg1: !stream.timepoint) -> !stream.timepoint {
   // CHECK: %[[FENCE:.+]] = hal.fence.join at([%arg0, %arg1]) -> !hal.fence
   %0 = stream.timepoint.join max(%arg0, %arg1) => !stream.timepoint
-  // CHECK: return %[[FENCE]]
-  return %0 : !stream.timepoint
+  // CHECK: util.return %[[FENCE]]
+  util.return %0 : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @timepointBarrier
 //  CHECK-SAME: (%[[R0:.+]]: !hal.buffer) -> (!hal.buffer, !hal.fence)
-func.func @timepointBarrier(%r0: !stream.resource<external>) -> (!stream.resource<external>, !stream.timepoint) {
+util.func public @timepointBarrier(%r0: !stream.resource<external>) -> (!stream.resource<external>, !stream.timepoint) {
   %c128 = arith.constant 128 : index
   // CHECK: %[[R1T:.+]] = util.null : !hal.fence
   %r1, %r1t = stream.timepoint.barrier %r0 : !stream.resource<external>{%c128} => !stream.timepoint
-  // CHECK: return %[[R0]], %[[R1T]]
-  return %r1, %r1t : !stream.resource<external>, !stream.timepoint
+  // CHECK: util.return %[[R0]], %[[R1T]]
+  util.return %r1, %r1t : !stream.resource<external>, !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @timepointAwait
-func.func @timepointAwait(%arg0: !stream.timepoint, %arg1: !stream.resource<staging>, %arg2: !stream.resource<*>) -> (!stream.resource<staging>, !stream.resource<*>) {
+util.func public @timepointAwait(%arg0: !stream.timepoint, %arg1: !stream.resource<staging>, %arg2: !stream.resource<*>) -> (!stream.resource<staging>, !stream.resource<*>) {
   %c100 = arith.constant 100 : index
   %c200 = arith.constant 200 : index
   // CHECK: %[[WAIT_OK:.+]] = hal.fence.await until([%arg0]) timeout_millis(%c-1_i32) : i32
   // CHECK-NEXT: util.status.check_ok %[[WAIT_OK]]
   %0:2 = stream.timepoint.await %arg0 => %arg1, %arg2 : !stream.resource<staging>{%c100}, !stream.resource<*>{%c200}
-  // CHECK: return %arg1, %arg2
-  return %0#0, %0#1 : !stream.resource<staging>, !stream.resource<*>
+  // CHECK: util.return %arg1, %arg2
+  util.return %0#0, %0#1 : !stream.resource<staging>, !stream.resource<*>
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/transfer_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/transfer_ops.mlir
index f594e7c..1dbcc24 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/transfer_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/transfer_ops.mlir

@@ -1,7 +1,7 @@
 // RUN: iree-opt --split-input-file --iree-hal-conversion %s | FileCheck %s
 
 // CHECK-LABEL: @tensorImportBuffer
-func.func @tensorImportBuffer(%arg0: !hal.buffer, %arg1: index) -> !stream.resource<external> {
+util.func public @tensorImportBuffer(%arg0: !hal.buffer, %arg1: index) -> !stream.resource<external> {
   %c20 = arith.constant 20 : index
   // CHECK-DAG: %[[ALLOCATOR:.+]] = hal.device.allocator
   // CHECK: hal.buffer.assert<%arg0 : !hal.buffer>
@@ -11,8 +11,8 @@
   // CHECK-SAME: type(DeviceVisible)
   // CHECK-SAME: usage("Transfer{{.+}}Dispatch{{.+}}")
   %0 = stream.tensor.import %arg0 : !hal.buffer -> tensor<?x5xf32>{%arg1} in !stream.resource<external>{%c20}
-  // CHECK: return %arg0
-  return %0 : !stream.resource<external>
+  // CHECK: util.return %arg0
+  util.return %0 : !stream.resource<external>
 }
 
 // -----
@@ -22,7 +22,7 @@
 // buffer itself.
 
 // CHECK-LABEL: @tensorImportBufferView
-func.func @tensorImportBufferView(%arg0: !hal.buffer_view, %arg1: index) -> !stream.resource<external> {
+util.func public @tensorImportBufferView(%arg0: !hal.buffer_view, %arg1: index) -> !stream.resource<external> {
   %c20 = arith.constant 20 : index
   // CHECK-DAG: %[[BUFFER:.+]] = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
   // CHECK-DAG: %[[ALLOCATOR:.+]] = hal.device.allocator
@@ -33,24 +33,24 @@
   // CHECK-SAME: type(DeviceVisible)
   // CHECK-SAME: usage("Transfer{{.+}}Dispatch{{.+}}")
   %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x5xf32>{%arg1} in !stream.resource<external>{%c20}
-  // CHECK: return %[[BUFFER]]
-  return %0 : !stream.resource<external>
+  // CHECK: util.return %[[BUFFER]]
+  util.return %0 : !stream.resource<external>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorExportBuffer
-func.func @tensorExportBuffer(%arg0: !stream.resource<external>, %arg1: index) -> !hal.buffer {
+util.func public @tensorExportBuffer(%arg0: !stream.resource<external>, %arg1: index) -> !hal.buffer {
   %c200 = arith.constant 200 : index
   %0 = stream.tensor.export %arg0 : tensor<?x1x10xf32>{%arg1} in !stream.resource<external>{%c200} -> !hal.buffer
-  // CHECK: return %arg0 : !hal.buffer
-  return %0 : !hal.buffer
+  // CHECK: util.return %arg0 : !hal.buffer
+  util.return %0 : !hal.buffer
 }
 
 // -----
 
 // CHECK-LABEL: @tensorExportBufferView
-func.func @tensorExportBufferView(%arg0: !stream.resource<external>, %arg1: index) -> !hal.buffer_view {
+util.func public @tensorExportBufferView(%arg0: !stream.resource<external>, %arg1: index) -> !hal.buffer_view {
   %c200 = arith.constant 200 : index
   // CHECK-DAG: %[[ELEMENT_TYPE:.+]] = hal.element_type<f32> : i32
   // CHECK-DAG: %[[ENCODING_TYPE:.+]] = hal.encoding_type<dense_row_major> : i32
@@ -61,6 +61,6 @@
   // CHECK-SAME: encoding(%[[ENCODING_TYPE]])
   // CHECK-SAME: : !hal.buffer_view
   %0 = stream.tensor.export %arg0 : tensor<?x1x10xf32>{%arg1} in !stream.resource<external>{%c200} -> !hal.buffer_view
-  // CHECK: return %[[VIEW]]
-  return %0 : !hal.buffer_view
+  // CHECK: util.return %[[VIEW]]
+  util.return %0 : !hal.buffer_view
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/UtilToHAL/test/global_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/Conversion/UtilToHAL/test/global_ops.mlir
index b25d3c2..0686f0e 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/UtilToHAL/test/global_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/UtilToHAL/test/global_ops.mlir

@@ -9,11 +9,11 @@
 
 // CHECK-LABEL: @resourceGlobals
 // CHECK-SAME: (%[[ARG0:.+]]: !hal.buffer) -> !hal.buffer
-func.func private @resourceGlobals(%arg0: !stream.resource<variable>) -> !stream.resource<variable> {
+util.func private @resourceGlobals(%arg0: !stream.resource<variable>) -> !stream.resource<variable> {
   // CHECK: util.global.store %[[ARG0]], @resource : !hal.buffer
   util.global.store %arg0, @resource : !stream.resource<variable>
   // CHECK: %[[VALUE:.+]] = util.global.load @resource : !hal.buffer
   %value = util.global.load @resource : !stream.resource<variable>
-  // CHECK: return %[[VALUE]]
-  return %value : !stream.resource<variable>
+  // CHECK: util.return %[[VALUE]]
+  util.return %value : !stream.resource<variable>
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/test/allocator_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/IR/test/allocator_ops.mlir
index cd35bf6..628a903 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/test/allocator_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/test/allocator_ops.mlir

@@ -2,7 +2,7 @@
 
 // CHECK-LABEL: @allocator_allocate
 //  CHECK-SAME: (%[[ALLOCATOR:.+]]: !hal.allocator)
-func.func @allocator_allocate(%allocator: !hal.allocator) {
+util.func public @allocator_allocate(%allocator: !hal.allocator) {
   // CHECK-DAG: %[[AFFINITY:.+]] = arith.constant -1
   %affinity = arith.constant -1 : i64
   // CHECK-DAG: %[[SIZE:.+]] = arith.constant 123
@@ -13,14 +13,14 @@
   // CHECK-SAME:   : !hal.buffer{%[[SIZE]]}
   %ref = hal.allocator.allocate<%allocator : !hal.allocator>
       affinity(%affinity) type(HostLocal) usage(Transfer) : !hal.buffer{%size}
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @allocator_import
 //  CHECK-SAME: %[[ALLOCATOR:.+]]: !hal.allocator
-func.func @allocator_import(%allocator: !hal.allocator, %arg1: !util.buffer) {
+util.func public @allocator_import(%allocator: !hal.allocator, %arg1: !util.buffer) {
   // CHECK-DAG: %[[OFFSET:.+]] = arith.constant 100
   %offset = arith.constant 100 : index
   // CHECK-DAG: %[[LENGTH:.+]] = arith.constant 200
@@ -36,5 +36,5 @@
   %ok, %ref = hal.allocator.import<%allocator : !hal.allocator>
       source(%arg1 : !util.buffer)[%offset, %length]
       affinity(%affinity) type(DeviceLocal) usage(Transfer) : i1, !hal.buffer
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/test/buffer_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/IR/test/buffer_ops.mlir
index 88b51b5..278ff8e 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/test/buffer_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/test/buffer_ops.mlir

@@ -1,44 +1,44 @@
 // RUN: iree-opt --split-input-file %s | iree-opt --split-input-file | FileCheck %s
 
 // CHECK-LABEL: @buffer_subspan
-func.func @buffer_subspan(%arg0: !hal.buffer) -> !hal.buffer {
+util.func public @buffer_subspan(%arg0: !hal.buffer) -> !hal.buffer {
   // CHECK-DAG: %[[OFFSET:.+]] = arith.constant 100
   %offset = arith.constant 100 : index
   // CHECK-DAG: %[[LENGTH:.+]] = arith.constant 200
   %length = arith.constant 200 : index
   // CHECK: %buffer = hal.buffer.subspan<%arg0 : !hal.buffer>[%[[OFFSET]], %[[LENGTH]]] : !hal.buffer
   %buffer = hal.buffer.subspan<%arg0 : !hal.buffer>[%offset, %length] : !hal.buffer
-  return %buffer : !hal.buffer
+  util.return %buffer : !hal.buffer
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_length
-func.func @buffer_length(%arg0: !hal.buffer) -> index {
+util.func public @buffer_length(%arg0: !hal.buffer) -> index {
   // CHECK: hal.buffer.length<%arg0 : !hal.buffer> : index
   %length = hal.buffer.length<%arg0 : !hal.buffer> : index
-  return %length : index
+  util.return %length : index
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_load
-func.func @buffer_load(%arg0: !hal.buffer) -> i32 {
+util.func public @buffer_load(%arg0: !hal.buffer) -> i32 {
   // CHECK-DAG: %[[SRC_OFFSET:.+]] = arith.constant 100
   %src_offset = arith.constant 100 : index
   // CHECK: %[[VAL:.+]] = hal.buffer.load<%arg0 : !hal.buffer>[%[[SRC_OFFSET]]] : i32
   %1 = hal.buffer.load<%arg0 : !hal.buffer>[%src_offset] : i32
-  // CHECK-NEXT: return %[[VAL]]
-  return %1 : i32
+  // CHECK-NEXT: util.return %[[VAL]]
+  util.return %1 : i32
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_store
-func.func @buffer_store(%arg0: !hal.buffer, %arg1: i32) {
+util.func public @buffer_store(%arg0: !hal.buffer, %arg1: i32) {
   // CHECK-DAG: %[[DST_OFFSET:.+]] = arith.constant 100
   %dst_offset = arith.constant 100 : index
   // CHECK: hal.buffer.store<%arg0 : !hal.buffer>[%[[DST_OFFSET]]] value(%arg1 : i32)
   hal.buffer.store<%arg0 : !hal.buffer>[%dst_offset] value(%arg1 : i32)
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/test/buffer_view_folding.mlir b/compiler/src/iree/compiler/Dialect/HAL/IR/test/buffer_view_folding.mlir
index b2ec3ce..8d94bed 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/test/buffer_view_folding.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/test/buffer_view_folding.mlir

@@ -2,7 +2,7 @@
 
 // CHECK-LABEL: @FoldBufferViewCreateSubspan
 // CHECK-SAME: (%[[BASE_BUFFER:.+]]: !hal.buffer, %[[SUBSPAN_OFFSET:.+]]: index, %[[SUBSPAN_LENGTH:.+]]: index)
-func.func @FoldBufferViewCreateSubspan(%base_buffer: !hal.buffer, %subspan_offset: index, %subspan_length: index) -> !hal.buffer_view {
+util.func public @FoldBufferViewCreateSubspan(%base_buffer: !hal.buffer, %subspan_offset: index, %subspan_length: index) -> !hal.buffer_view {
   %subspan = hal.buffer.subspan<%base_buffer : !hal.buffer>[%subspan_offset, %subspan_length] : !hal.buffer
   // CHECK-DAG: %[[VIEW_OFFSET:.+]] = arith.constant 512
   %view_offset = arith.constant 512 : index
@@ -18,5 +18,5 @@
                                  shape([%dim0])
                                  type(%type)
                                  encoding(%encoding) : !hal.buffer_view
-  return %view : !hal.buffer_view
+  util.return %view : !hal.buffer_view
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/test/buffer_view_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/IR/test/buffer_view_ops.mlir
index 9be19f2..df6848a 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/test/buffer_view_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/test/buffer_view_ops.mlir

@@ -1,27 +1,27 @@
 // RUN: iree-opt --split-input-file %s | iree-opt --split-input-file | FileCheck %s
 
 // CHECK-LABEL: @element_type
-func.func @element_type() -> i32 {
+util.func public @element_type() -> i32 {
   // CHECK: %[[RET:.+]] = hal.element_type<f32> : i32
   %element_type = hal.element_type<f32> : i32
-  // CHECK: return %[[RET]]
-  return %element_type : i32
+  // CHECK: util.return %[[RET]]
+  util.return %element_type : i32
 }
 
 // -----
 
 // CHECK-LABEL: @encoding_type
-func.func @encoding_type() -> i32 {
+util.func public @encoding_type() -> i32 {
   // CHECK: %[[RET:.+]] = hal.encoding_type<dense_row_major> : i32
   %encoding_type = hal.encoding_type<dense_row_major> : i32
-  // CHECK: return %[[RET]]
-  return %encoding_type : i32
+  // CHECK: util.return %[[RET]]
+  util.return %encoding_type : i32
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_view_create
-func.func @buffer_view_create(%arg0: !hal.buffer, %arg1: index, %arg2: index) -> !hal.buffer_view {
+util.func public @buffer_view_create(%arg0: !hal.buffer, %arg1: index, %arg2: index) -> !hal.buffer_view {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   %c1_i32 = arith.constant 1 : i32
@@ -35,25 +35,25 @@
                                  shape([%arg1, %arg2])
                                  type(%c32_i32)
                                  encoding(%c1_i32) : !hal.buffer_view
-  return %view : !hal.buffer_view
+  util.return %view : !hal.buffer_view
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_view_buffer
-func.func @buffer_view_buffer(%arg0: !hal.buffer_view) -> !hal.buffer {
+util.func public @buffer_view_buffer(%arg0: !hal.buffer_view) -> !hal.buffer {
   // CHECK: %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
   %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
-  return %buffer : !hal.buffer
+  util.return %buffer : !hal.buffer
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_view_shape_queries
-func.func @buffer_view_shape_queries(%arg0: !hal.buffer_view) -> (index, index) {
+util.func public @buffer_view_shape_queries(%arg0: !hal.buffer_view) -> (index, index) {
   // CHECK: %{{.+}} = hal.buffer_view.rank<%arg0 : !hal.buffer_view> : index
   %0 = hal.buffer_view.rank<%arg0 : !hal.buffer_view> : index
   // CHECK: %{{.+}} = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
   %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
-  return %0, %1 : index, index
+  util.return %0, %1 : index, index
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/test/channel_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/IR/test/channel_ops.mlir
index 5a86fb6..c86ef92 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/test/channel_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/test/channel_ops.mlir

@@ -2,7 +2,7 @@
 
 // CHECK-LABEL: @channel_create
 //  CHECK-SAME: (%[[DEVICE:.+]]: !hal.device, %[[AFFINITY:.+]]: i64, %[[ID:.+]]: !util.buffer, %[[GROUP:.+]]: !util.buffer, %[[RANK:.+]]: i32, %[[COUNT:.+]]: i32)
-func.func @channel_create(%device: !hal.device, %affinity: i64, %id: !util.buffer, %group: !util.buffer, %rank: i32, %count: i32) {
+util.func public @channel_create(%device: !hal.device, %affinity: i64, %id: !util.buffer, %group: !util.buffer, %rank: i32, %count: i32) {
   //      CHECK: %channel = hal.channel.create
   // CHECK-SAME:   device(%[[DEVICE]] : !hal.device)
   // CHECK-SAME:   affinity(%[[AFFINITY]])
@@ -18,14 +18,14 @@
                                  group(%group)
                                   rank(%rank)
                                  count(%count) : !hal.channel
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @channel_split
 //  CHECK-SAME: (%[[BASE_CHANNEL:.+]]: !hal.channel, %[[COLOR:.+]]: i32, %[[KEY:.+]]: i32)
-func.func @channel_split(%base_channel: !hal.channel, %color: i32, %key: i32) {
+util.func public @channel_split(%base_channel: !hal.channel, %color: i32, %key: i32) {
   //      CHECK: %channel = hal.channel.split<%[[BASE_CHANNEL]] : !hal.channel>
   // CHECK-SAME:   color(%[[COLOR]])
   // CHECK-SAME:   key(%[[KEY]])
@@ -34,15 +34,15 @@
                               color(%color)
                                 key(%key)
                               flags(0) : !hal.channel
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @channel_rank_and_count
 // CHECK-SAME: (%[[CHANNEL:.+]]: !hal.channel)
-func.func @channel_rank_and_count(%channel: !hal.channel) -> (i32, i32) {
+util.func public @channel_rank_and_count(%channel: !hal.channel) -> (i32, i32) {
   // CHECK: = hal.channel.rank_and_count<%[[CHANNEL]] : !hal.channel> : i32, i32
   %rank, %count = hal.channel.rank_and_count<%channel : !hal.channel> : i32, i32
-  return %rank, %count : i32, i32
+  util.return %rank, %count : i32, i32
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/test/command_buffer_folding.mlir b/compiler/src/iree/compiler/Dialect/HAL/IR/test/command_buffer_folding.mlir
index 3e1b94d..5ced86a 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/test/command_buffer_folding.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/test/command_buffer_folding.mlir

@@ -2,7 +2,7 @@
 
 // CHECK-LABEL: @skip_command_buffer_device
 // CHECK-SAME: (%[[DEVICE:.+]]: !hal.device)
-func.func @skip_command_buffer_device(%device: !hal.device) -> !hal.executable {
+util.func public @skip_command_buffer_device(%device: !hal.device) -> !hal.executable {
   %cmd = hal.command_buffer.create device(%device : !hal.device)
                                      mode(OneShot)
                                categories("Transfer|Dispatch") : !hal.command_buffer
@@ -14,7 +14,7 @@
   %exe = hal.executable.lookup device(%device2 : !hal.device)
                            executable(@executable_name) : !hal.executable
 
-  return %exe : !hal.executable
+  util.return %exe : !hal.executable
 }
 
 // -----
@@ -22,7 +22,7 @@
 // CHECK-LABEL: @fold_buffer_subspan_into_fill_buffer
 //  CHECK-SAME: %[[CMD:.+]]: !hal.command_buffer,
 //  CHECK-SAME: %[[BASE_BUFFER:.+]]: !hal.buffer
-func.func @fold_buffer_subspan_into_fill_buffer(
+util.func public @fold_buffer_subspan_into_fill_buffer(
     %cmd: !hal.command_buffer,
     %buffer: !hal.buffer
   ) {
@@ -37,7 +37,7 @@
       // CHECK-SAME: target(%[[BASE_BUFFER]] : !hal.buffer)[%c108192, %c8192]
       target(%target_subspan : !hal.buffer)[%c100000, %c8192]
       pattern(%c1234_i32 : i32)
-  return
+  util.return
 }
 
 // -----
@@ -45,7 +45,7 @@
 // CHECK-LABEL: @fold_buffer_subspan_into_copy_buffer
 //  CHECK-SAME: %[[CMD:.+]]: !hal.command_buffer,
 //  CHECK-SAME: %[[BASE_BUFFER:.+]]: !hal.buffer
-func.func @fold_buffer_subspan_into_copy_buffer(
+util.func public @fold_buffer_subspan_into_copy_buffer(
     %cmd: !hal.command_buffer,
     %buffer: !hal.buffer
   ) {
@@ -63,7 +63,7 @@
       // CHECK-SAME: target(%[[BASE_BUFFER]] : !hal.buffer)[%c108192]
       target(%target_subspan : !hal.buffer)[%c100000]
       length(%c8192)
-  return
+  util.return
 }
 
 // -----
@@ -72,7 +72,7 @@
 //  CHECK-SAME: %[[CMD:.+]]: !hal.command_buffer,
 //  CHECK-SAME: %[[LAYOUT:.+]]: !hal.pipeline_layout,
 //  CHECK-SAME: %[[BASE_BUFFER:.+]]: !hal.buffer
-func.func @fold_buffer_subspan_into_push_descriptor_set(
+util.func public @fold_buffer_subspan_into_push_descriptor_set(
     %cmd: !hal.command_buffer,
     %layout: !hal.pipeline_layout,
     %buffer: !hal.buffer
@@ -101,5 +101,5 @@
         // CHECK-NEXT: %c2 = (%[[BASE_BUFFER]] : !hal.buffer)[%c4096, %c262144]
         %c2 = (%buffer : !hal.buffer)[%c4096, %c262144]
       ])
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/test/command_buffer_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/IR/test/command_buffer_ops.mlir
index 66026aa..cf9f38e 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/test/command_buffer_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/test/command_buffer_ops.mlir

@@ -2,7 +2,7 @@
 
 // CHECK-LABEL: @command_buffer_create
 //  CHECK-SAME: (%[[DEVICE:.+]]: !hal.device)
-func.func @command_buffer_create(%device: !hal.device) {
+util.func public @command_buffer_create(%device: !hal.device) {
   //      CHECK: %cmd = hal.command_buffer.create
   // CHECK-SAME:   device(%[[DEVICE]] : !hal.device)
   // CHECK-SAME:   mode(OneShot)
@@ -10,34 +10,34 @@
   %cmd = hal.command_buffer.create device(%device : !hal.device)
                                      mode(OneShot)
                                categories("Transfer|Dispatch") : !hal.command_buffer
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @command_buffer_finalize
 //  CHECK-SAME: (%[[CMD:.+]]: !hal.command_buffer)
-func.func @command_buffer_finalize(%cmd: !hal.command_buffer) {
+util.func public @command_buffer_finalize(%cmd: !hal.command_buffer) {
   // CHECK: hal.command_buffer.finalize<%[[CMD]] : !hal.command_buffer>
   hal.command_buffer.finalize<%cmd : !hal.command_buffer>
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @command_buffer_device
 //  CHECK-SAME: (%[[CMD:.+]]: !hal.command_buffer)
-func.func @command_buffer_device(%cmd: !hal.command_buffer) {
+util.func public @command_buffer_device(%cmd: !hal.command_buffer) {
   // CHECK: %0 = hal.command_buffer.device<%[[CMD]] : !hal.command_buffer> : !hal.device
   %0 = hal.command_buffer.device<%cmd : !hal.command_buffer> : !hal.device
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @command_buffer_execution_barrier
 //  CHECK-SAME: (%[[CMD:.+]]: !hal.command_buffer)
-func.func @command_buffer_execution_barrier(%cmd: !hal.command_buffer) {
+util.func public @command_buffer_execution_barrier(%cmd: !hal.command_buffer) {
   //      CHECK: hal.command_buffer.execution_barrier<%[[CMD]] : !hal.command_buffer>
   // CHECK-SAME:   source(CommandIssue)
   // CHECK-SAME:   target(CommandProcess)
@@ -46,7 +46,7 @@
       source(CommandIssue)
       target(CommandProcess)
       flags(None)
-  return
+  util.return
 }
 
 // -----
@@ -56,7 +56,7 @@
 //  CHECK-SAME: %[[BUFFER:.+]]: !hal.buffer,
 //  CHECK-SAME: %[[OFFSET:.+]]: index, %[[LENGTH:.+]]: index,
 //  CHECK-SAME: %[[PATTERN:.+]]: i32)
-func.func @command_buffer_fill_buffer(
+util.func public @command_buffer_fill_buffer(
     %cmd: !hal.command_buffer,
     %buffer: !hal.buffer,
     %offset: index,
@@ -68,7 +68,7 @@
   hal.command_buffer.fill_buffer<%cmd : !hal.command_buffer>
       target(%buffer : !hal.buffer)[%offset, %length]
       pattern(%pattern : i32)
-  return
+  util.return
 }
 
 // -----
@@ -78,7 +78,7 @@
 //  CHECK-SAME: %[[BUFFER:.+]]: !hal.buffer,
 //  CHECK-SAME: %[[SRC_OFFSET:.+]]: index, %[[DST_OFFSET:.+]]: index,
 //  CHECK-SAME: %[[LENGTH:.+]]: index)
-func.func @command_buffer_copy_buffer(
+util.func public @command_buffer_copy_buffer(
     %cmd: !hal.command_buffer,
     %buffer: !hal.buffer,
     %src_offset: index,
@@ -93,7 +93,7 @@
       source(%buffer : !hal.buffer)[%src_offset]
       target(%buffer : !hal.buffer)[%dst_offset]
       length(%length)
-  return
+  util.return
 }
 
 // -----
@@ -104,7 +104,7 @@
 //  CHECK-SAME:  %[[PARAM:.+]]: i32,
 //  CHECK-SAME:  %[[SEND_BUFFER:.+]]: !hal.buffer, %[[RECV_BUFFER:.+]]: !hal.buffer,
 //  CHECK-SAME:  %[[COUNT:.+]]: index)
-func.func @command_buffer_collective(
+util.func public @command_buffer_collective(
     %cmd: !hal.command_buffer,
     %channel: !hal.channel,
     %param: i32,
@@ -154,7 +154,7 @@
       recv(%recv_buffer : !hal.buffer)[%c20, %c128]
       count(%count)
 
-  return
+  util.return
 }
 
 // -----
@@ -164,7 +164,7 @@
 //  CHECK-SAME: %[[LAYOUT:.+]]: !hal.pipeline_layout,
 //  CHECK-SAME: %[[BUFFER:.+]]: !hal.buffer,
 //  CHECK-SAME: %[[SLOT:.+]]: index
-func.func @command_buffer_push_descriptor_set(
+util.func public @command_buffer_push_descriptor_set(
     %cmd: !hal.command_buffer,
     %layout: !hal.pipeline_layout,
     %buffer: !hal.buffer,
@@ -185,7 +185,7 @@
         // CHECK-NEXT: %c1 = (%[[SLOT]] : index)[%c4, %c4096]
         %c1 = (%slot : index)[%c4, %c4096]
       ])
-  return
+  util.return
 }
 
 // -----
@@ -204,7 +204,7 @@
 // CHECK-LABEL: @command_buffer_dispatch
 //  CHECK-SAME: (%[[CMD:.+]]: !hal.command_buffer,
 //  CHECK-SAME: %[[X:.+]]: index, %[[Y:.+]]: index, %[[Z:.+]]: index)
-func.func @command_buffer_dispatch(
+util.func public @command_buffer_dispatch(
     %cmd: !hal.command_buffer,
     %x: index,
     %y: index,
@@ -215,7 +215,7 @@
   hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer>
       target(@ex::@backend::@entry0)
       workgroups([%x, %y, %z])
-  return
+  util.return
 }
 
 // -----
@@ -235,7 +235,7 @@
 //  CHECK-SAME: (%[[CMD:.+]]: !hal.command_buffer,
 //  CHECK-SAME:  %[[BUFFER:.+]]: !hal.buffer,
 //  CHECK-SAME:  %[[OFFSET:.+]]: index)
-func.func @command_buffer_dispatch_indirect(
+util.func public @command_buffer_dispatch_indirect(
     %cmd: !hal.command_buffer,
     %buffer: !hal.buffer,
     %offset: index) {
@@ -245,5 +245,5 @@
   hal.command_buffer.dispatch.indirect.symbol<%cmd : !hal.command_buffer>
       target(@ex::@backend::@entry0)
       workgroups(%buffer : !hal.buffer)[%offset]
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/test/descriptor_set_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/IR/test/descriptor_set_ops.mlir
index c50ed5d..86180ac 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/test/descriptor_set_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/test/descriptor_set_ops.mlir

@@ -2,7 +2,7 @@
 
 // CHECK-LABEL: @descriptor_set_layout_create
 // CHECK-SAME: (%[[DEVICE:.+]]: !hal.device)
-func.func @descriptor_set_layout_create(%device: !hal.device) {
+util.func public @descriptor_set_layout_create(%device: !hal.device) {
   //      CHECK: = hal.descriptor_set_layout.create
   // CHECK-SAME:     device(%[[DEVICE]] : !hal.device)
   // CHECK-SAME:     flags("None")
@@ -16,5 +16,5 @@
     #hal.descriptor_set.binding<0, storage_buffer>,
     #hal.descriptor_set.binding<1, storage_buffer>
   ]) : !hal.descriptor_set_layout
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/test/device_folding.mlir b/compiler/src/iree/compiler/Dialect/HAL/IR/test/device_folding.mlir
index 878faab..a27d57a 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/test/device_folding.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/test/device_folding.mlir

@@ -2,7 +2,7 @@
 
 // CHECK-LABEL: @ImmediatelyResolveDeviceQueueBarrier
 // CHECK-SAME: (%[[DEVICE:.+]]: !hal.device, %[[SIGNAL_FENCE:.+]]: !hal.fence)
-func.func @ImmediatelyResolveDeviceQueueBarrier(%device: !hal.device, %signal_fence: !hal.fence) {
+util.func public @ImmediatelyResolveDeviceQueueBarrier(%device: !hal.device, %signal_fence: !hal.fence) {
   %c-1_i64 = arith.constant -1 : i64
   // CHECK-NOT: util.null
   %wait_fence = util.null : !hal.fence
@@ -12,27 +12,27 @@
       affinity(%c-1_i64)
       wait(%wait_fence)
       signal(%signal_fence)
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @HoistDeviceQueueBarrierChain
 // CHECK-SAME: (%[[DEVICE:.+]]: !hal.device, %[[SIGNAL_FENCE:.+]]: !hal.fence)
-func.func @HoistDeviceQueueBarrierChain(%device: !hal.device, %signal_fence: !hal.fence) {
+util.func public @HoistDeviceQueueBarrierChain(%device: !hal.device, %signal_fence: !hal.fence) {
   %c-1_i64 = arith.constant -1 : i64
   // CHECK-NOT: hal.fence.create
   %temp_fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
-  // CHECK: call @external_async_fn(%[[SIGNAL_FENCE]])
-  call @external_async_fn(%temp_fence) : (!hal.fence) -> ()
+  // CHECK: util.call @external_async_fn(%[[SIGNAL_FENCE]])
+  util.call @external_async_fn(%temp_fence) : (!hal.fence) -> ()
   // CHECK-NOT: hal.device.queue.execute
   hal.device.queue.execute<%device : !hal.device>
       affinity(%c-1_i64)
       wait(%temp_fence)
       signal(%signal_fence)
-  return
+  util.return
 }
-func.func private @external_async_fn(!hal.fence)
+util.func private @external_async_fn(!hal.fence)
 
 // -----
 
@@ -41,7 +41,7 @@
 
 // CHECK-LABEL: @HoistDeviceQueueBarrierChainOutOfOrder
 // CHECK-SAME: (%[[DEVICE:.+]]: !hal.device, %[[CMD:.+]]: !hal.command_buffer, %[[WAIT_FENCE:.+]]: !hal.fence)
-func.func @HoistDeviceQueueBarrierChainOutOfOrder(%device: !hal.device, %cmd: !hal.command_buffer, %wait_fence: !hal.fence) -> !hal.fence {
+util.func public @HoistDeviceQueueBarrierChainOutOfOrder(%device: !hal.device, %cmd: !hal.command_buffer, %wait_fence: !hal.fence) -> !hal.fence {
   %c-1_i64 = arith.constant -1 : i64
   // CHECK: %[[FENCE1:.+]] = hal.fence.create {{.+}} {test.fence1}
   %fence0 = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence attributes {test.fence0}
@@ -58,8 +58,8 @@
       affinity(%c-1_i64)
       wait(%fence0)
       signal(%fence1)
-  // CHECK: return %[[FENCE1]]
-  return %fence1 : !hal.fence
+  // CHECK: util.return %[[FENCE1]]
+  util.return %fence1 : !hal.fence
 }
 
 // -----
@@ -69,7 +69,7 @@
 // CHECK-SAME:  %[[CMD:.+]]: !hal.command_buffer,
 // CHECK-SAME:  %[[WAIT_FENCE:.+]]: !hal.fence,
 // CHECK-SAME:  %[[SIGNAL_FENCE:.+]]: !hal.fence)
-func.func @ElideDeviceQueueBarrierOp(
+util.func public @ElideDeviceQueueBarrierOp(
     %device: !hal.device,
     %cmd: !hal.command_buffer,
     %wait_fence: !hal.fence,
@@ -112,6 +112,6 @@
       wait(%fence1)
       signal(%signal_fence)
 
-  // CHECK-NEXT: return
-  return
+  // CHECK-NEXT: util.return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/test/device_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/IR/test/device_ops.mlir
index a04e5db..206c3bb 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/test/device_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/test/device_ops.mlir

@@ -2,26 +2,26 @@
 
 // CHECK-LABEL: @device_allocator
 // CHECK-SAME: (%[[DEVICE:.+]]: !hal.device)
-func.func @device_allocator(%device: !hal.device) -> !hal.allocator {
+util.func public @device_allocator(%device: !hal.device) -> !hal.allocator {
   // CHECK: %allocator = hal.device.allocator<%[[DEVICE]] : !hal.device> : !hal.allocator
   %allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
-  return %allocator : !hal.allocator
+  util.return %allocator : !hal.allocator
 }
 
 // -----
 
 // CHECK-LABEL: @device_query
 // CHECK-SAME: (%[[DEVICE:.+]]: !hal.device)
-func.func @device_query(%device : !hal.device) -> (i1, i32) {
+util.func public @device_query(%device : !hal.device) -> (i1, i32) {
   // CHECK: = hal.device.query<%[[DEVICE]] : !hal.device> key("sys" :: "foo") : i1, i32
   %ok, %value = hal.device.query<%device : !hal.device> key("sys" :: "foo") : i1, i32
-  return %ok, %value : i1, i32
+  util.return %ok, %value : i1, i32
 }
 
 // -----
 
 // CHECK-LABEL: @device_queue_alloca
-func.func @device_queue_alloca(
+util.func public @device_queue_alloca(
     // CHECK-SAME: (%[[DEVICE:.+]]: !hal.device, %[[AFFINITY:.+]]: i64,
     %device: !hal.device, %affinity: i64,
     // CHECK-SAME:  %[[WAIT_FENCE:.+]]: !hal.fence, %[[SIGNAL_FENCE:.+]]: !hal.fence,
@@ -41,13 +41,13 @@
       type(DeviceLocal) usage(Transfer)
       // CHECK-SAME: : !hal.buffer{%[[SIZE]]}
       : !hal.buffer{%size}
-  return %buffer : !hal.buffer
+  util.return %buffer : !hal.buffer
 }
 
 // -----
 
 // CHECK-LABEL: @device_queue_dealloca
-func.func @device_queue_dealloca(
+util.func public @device_queue_dealloca(
     // CHECK-SAME: (%[[DEVICE:.+]]: !hal.device, %[[AFFINITY:.+]]: i64,
     %device: !hal.device, %affinity: i64,
     // CHECK-SAME:  %[[WAIT_FENCE:.+]]: !hal.fence, %[[SIGNAL_FENCE:.+]]: !hal.fence,
@@ -62,13 +62,13 @@
       wait(%wait_fence) signal(%signal_fence)
       // CHECK-SAME: buffer(%[[BUFFER]] : !hal.buffer)
       buffer(%buffer : !hal.buffer)
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @device_queue_read
-func.func @device_queue_read(
+util.func public @device_queue_read(
     // CHECK-SAME: (%[[DEVICE:.+]]: !hal.device, %[[AFFINITY:.+]]: i64,
     %device: !hal.device, %affinity: i64,
     // CHECK-SAME:  %[[WAIT_FENCE:.+]]: !hal.fence, %[[SIGNAL_FENCE:.+]]: !hal.fence,
@@ -97,13 +97,13 @@
       length(%length)
       // CHECK-SAME: flags(0)
       flags(0)
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @device_queue_write
-func.func @device_queue_write(
+util.func public @device_queue_write(
     // CHECK-SAME: (%[[DEVICE:.+]]: !hal.device, %[[AFFINITY:.+]]: i64,
     %device: !hal.device, %affinity: i64,
     // CHECK-SAME:  %[[WAIT_FENCE:.+]]: !hal.fence, %[[SIGNAL_FENCE:.+]]: !hal.fence,
@@ -132,13 +132,13 @@
       length(%length)
       // CHECK-SAME: flags(0)
       flags(0)
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @device_queue_execute
-func.func @device_queue_execute(
+util.func public @device_queue_execute(
     // CHECK-SAME: (%[[DEVICE:.+]]: !hal.device, %[[AFFINITY:.+]]: i64,
     %device: !hal.device, %affinity: i64,
     // CHECK-SAME:  %[[WAIT_FENCE:.+]]: !hal.fence, %[[SIGNAL_FENCE:.+]]: !hal.fence,
@@ -153,18 +153,18 @@
       wait(%wait_fence) signal(%signal_fence)
       // CHECK-SAME: commands([%[[CMD0]], %[[CMD1]]])
       commands([%cmd0, %cmd1])
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @device_queue_flush
-func.func @device_queue_flush(
+util.func public @device_queue_flush(
     // CHECK-SAME: (%[[DEVICE:.+]]: !hal.device, %[[AFFINITY:.+]]: i64)
     %device: !hal.device, %affinity: i64) {
   // CHECK: hal.device.queue.flush<%[[DEVICE]] : !hal.device>
   hal.device.queue.flush<%device : !hal.device>
       // CHECK-SAME: affinity(%[[AFFINITY]])
       affinity(%affinity)
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/test/devices_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/IR/test/devices_ops.mlir
index 633318b..0283400 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/test/devices_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/test/devices_ops.mlir

@@ -1,18 +1,18 @@
 // RUN: iree-opt --split-input-file %s | iree-opt --split-input-file | FileCheck %s
 
 // CHECK-LABEL: @devices_count
-func.func @devices_count() -> index {
+util.func public @devices_count() -> index {
   // CHECK: = hal.devices.count : index
   %device_count = hal.devices.count : index
-  return %device_count : index
+  util.return %device_count : index
 }
 
 // -----
 
 // CHECK-LABEL: @devices_get
 // CHECK-SAME: (%[[INDEX:.+]]: index)
-func.func @devices_get(%index: index) -> !hal.device {
+util.func public @devices_get(%index: index) -> !hal.device {
   // CHECK: = hal.devices.get %[[INDEX]] : !hal.device
   %device = hal.devices.get %index : !hal.device
-  return %device : !hal.device
+  util.return %device : !hal.device
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/test/executable_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/IR/test/executable_ops.mlir
index acd872d..6da4866 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/test/executable_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/test/executable_ops.mlir

@@ -76,7 +76,7 @@
     hal.executable.condition(%device: !hal.device) -> i1 {
       // CHECK-NEXT: %[[OK:.+]], %[[VALUE:.+]] = hal.device.query<%[[DEVICE]]
       %ok, %value = hal.device.query<%device : !hal.device> key("some" :: "value") : i1, i32
-      // CHECK-NEXT: return %[[OK]]
+      // CHECK-NEXT: hal.return %[[OK]]
       hal.return %ok : i1
     }
 
@@ -144,7 +144,7 @@
 // CHECK-SAME: %[[DEVICE:.+]]: !hal.device,
 // CHECK-SAME: %[[LAYOUT0:.+]]: !hal.pipeline_layout,
 // CHECK-SAME: %[[LAYOUT1:.+]]: !hal.pipeline_layout
-func.func @executable_create(%device: !hal.device,
+util.func public @executable_create(%device: !hal.device,
                         %layout0: !hal.pipeline_layout,
                         %layout1: !hal.pipeline_layout) {
   //      CHECK: = hal.executable.create
@@ -154,7 +154,7 @@
   %0 = hal.executable.create device(%device : !hal.device)
                              target(@exe::@binary1)
                             layouts([%layout0, %layout1]) : !hal.executable
-  return
+  util.return
 }
 
 // -----
@@ -163,7 +163,7 @@
 // CHECK-SAME: %[[DEVICE:.+]]: !hal.device,
 // CHECK-SAME: %[[LAYOUT0:.+]]: !hal.descriptor_set_layout,
 // CHECK-SAME: %[[LAYOUT1:.+]]: !hal.descriptor_set_layout
-func.func @pipeline_layout_create(%device: !hal.device,
+util.func public @pipeline_layout_create(%device: !hal.device,
                                %layout0: !hal.descriptor_set_layout,
                                %layout1: !hal.descriptor_set_layout) {
   // CHECK: hal.pipeline_layout.create
@@ -173,7 +173,7 @@
   %0 = hal.pipeline_layout.create device(%device : !hal.device)
                             push_constants(1)
                                    layouts([%layout0, %layout1]) : !hal.pipeline_layout
-  return
+  util.return
 }
 
 // -----
@@ -197,7 +197,7 @@
 // CHECK-LABEL: @unresolved_workload
 // CHECK-SAME: (%[[DEVICE:.+]]: !hal.device,
 // CHECK-SAME:  %[[WORKLOAD_0:.+]]: index, %[[WORKLOAD_1:.+]]: index)
-func.func @unresolved_workload(%device: !hal.device,
+util.func public @unresolved_workload(%device: !hal.device,
                                %workload_0: index, %workload_1: index) -> (index, index, index) {
   // CHECK: %[[WORKGROUP_X:.+]], %[[WORKGROUP_Y:.+]], %[[WORKGROUP_Z:.+]] =
   // CHECK-SAME:   hal.executable.calculate_workgroups
@@ -208,6 +208,6 @@
       device(%device : !hal.device)
       target(@unresolved_workload_ex::@backend::@entry0)
       workload([%workload_0, %workload_1]) : index, index, index
-  // CHECK: return %[[WORKGROUP_X]], %[[WORKGROUP_Y]], %[[WORKGROUP_Z]]
-  return %workgroups#0, %workgroups#1, %workgroups#2 : index, index, index
+  // CHECK: util.return %[[WORKGROUP_X]], %[[WORKGROUP_Y]], %[[WORKGROUP_Z]]
+  util.return %workgroups#0, %workgroups#1, %workgroups#2 : index, index, index
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/test/experimental_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/IR/test/experimental_ops.mlir
index 3c8fd84..2d401c5 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/test/experimental_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/test/experimental_ops.mlir

@@ -2,7 +2,7 @@
 
 // CHECK-LABEL: @file_from_memory
 // CHECK-SAME: (%[[DEVICE:.+]]: !hal.device, %[[BUFFER:.+]]: !util.buffer)
-func.func @file_from_memory(%device: !hal.device, %buffer: !util.buffer) -> !hal.file {
+util.func public @file_from_memory(%device: !hal.device, %buffer: !util.buffer) -> !hal.file {
   // CHECK-DAG: %[[AFFINITY:.+]] = arith.constant -1
   %affinity = arith.constant -1 : i64
   // CHECK-DAG: %[[OFFSET:.+]] = arith.constant 100
@@ -24,5 +24,5 @@
       access(Read)
       buffer(%buffer : !util.buffer)[%offset for %length]
       flags(%flags) : !hal.file
-  return %file : !hal.file
+  util.return %file : !hal.file
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/test/fence_folding.mlir b/compiler/src/iree/compiler/Dialect/HAL/IR/test/fence_folding.mlir
index 2127d89..0bbd190 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/test/fence_folding.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/test/fence_folding.mlir

@@ -5,10 +5,10 @@
 // the program to simplify submissions.
 
 // CHECK-LABEL: @fence_create_unused
-func.func @fence_create_unused(%device: !hal.device) {
+util.func public @fence_create_unused(%device: !hal.device) {
   // CHECK-NOT: hal.fence.create
   %fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
-  return
+  util.return
 }
 
 // -----
@@ -17,10 +17,10 @@
 
 // CHECK-LABEL: @fence_join_one
 // CHECK-SAME: %[[ARG:.+]]: !hal.fence
-func.func @fence_join_one(%arg: !hal.fence) -> !hal.fence {
+util.func public @fence_join_one(%arg: !hal.fence) -> !hal.fence {
   %join = hal.fence.join at([%arg]) -> !hal.fence
-  // CHECK: return %[[ARG]]
-  return %join : !hal.fence
+  // CHECK: util.return %[[ARG]]
+  util.return %join : !hal.fence
 }
 
 // -----
@@ -28,11 +28,11 @@
 // Tests that a fence join with no operands folds into a util.null.
 
 // CHECK-LABEL: @fence_join_empty
-func.func @fence_join_empty() -> !hal.fence {
+util.func public @fence_join_empty() -> !hal.fence {
   // CHECK: %[[JOIN:.+]] = util.null : !hal.fence
   %join = hal.fence.join at([]) -> !hal.fence
-  // CHECK: return %[[JOIN]]
-  return %join : !hal.fence
+  // CHECK: util.return %[[JOIN]]
+  util.return %join : !hal.fence
 }
 
 // -----
@@ -41,13 +41,13 @@
 
 // CHECK-LABEL: @fence_join_null
 // CHECK-SAME: (%[[ARG0:.+]]: !hal.fence, %[[ARG1:.+]]: !hal.fence)
-func.func @fence_join_null(%arg0: !hal.fence, %arg1: !hal.fence) -> !hal.fence {
+util.func public @fence_join_null(%arg0: !hal.fence, %arg1: !hal.fence) -> !hal.fence {
   // CHECK-NOT: util.null
   %null = util.null : !hal.fence
   // CHECK: %[[JOIN:.+]] = hal.fence.join at([%[[ARG0]], %[[ARG1]]]) -> !hal.fence
   %join = hal.fence.join at([%arg0, %null, %arg1]) -> !hal.fence
-  // CHECK: return %[[JOIN]]
-  return %join : !hal.fence
+  // CHECK: util.return %[[JOIN]]
+  util.return %join : !hal.fence
 }
 
 // -----
@@ -56,11 +56,11 @@
 
 // CHECK-LABEL: @fence_join_duplicate_fences
 // CHECK-SAME: %[[FENCE0:.+]]: !hal.fence, %[[FENCE1:.+]]: !hal.fence
-func.func @fence_join_duplicate_fences(%fence0: !hal.fence, %fence1: !hal.fence) -> !hal.fence {
+util.func public @fence_join_duplicate_fences(%fence0: !hal.fence, %fence1: !hal.fence) -> !hal.fence {
   // CHECK: %[[JOIN:.+]] = hal.fence.join at([%[[FENCE0]], %[[FENCE1]]]) -> !hal.fence
   %join = hal.fence.join at([%fence0, %fence1, %fence0]) -> !hal.fence
-  // CHECK: return %[[JOIN]]
-  return %join : !hal.fence
+  // CHECK: util.return %[[JOIN]]
+  util.return %join : !hal.fence
 }
 
 // -----
@@ -70,19 +70,19 @@
 // is created and the time it is signaled.
 
 // CHECK-LABEL: @fence_elide_signaled
-func.func @fence_elide_signaled(%device: !hal.device) -> !hal.fence {
+util.func public @fence_elide_signaled(%device: !hal.device) -> !hal.fence {
   // CHECK-NOT: hal.fence.create
   %fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
   // Ok to have other things inbetween so long as they don't touch the fence.
-  // CHECK: call @external_nop_call
-  call @external_nop_call() : () -> ()
+  // CHECK: util.call @external_nop_call
+  util.call @external_nop_call() : () -> ()
   // CHECK-NOT: hal.fence.signal
   hal.fence.signal<%fence : !hal.fence>
   // CHECK: %[[FENCE:.+]] = util.null : !hal.fence
-  // CHECK: return %[[FENCE]]
-  return %fence : !hal.fence
+  // CHECK: util.return %[[FENCE]]
+  util.return %fence : !hal.fence
 }
-func.func private @external_nop_call()
+util.func private @external_nop_call()
 
 // -----
 
@@ -90,30 +90,30 @@
 // on between when it is created and when it is signaled.
 
 // CHECK-LABEL: @fence_cannot_elide_signaled
-func.func @fence_cannot_elide_signaled(%device: !hal.device) -> !hal.fence {
+util.func public @fence_cannot_elide_signaled(%device: !hal.device) -> !hal.fence {
   // CHECK: hal.fence.create
   %fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
   // Block the elision as the external call may wait on the fence.
-  // CHECK: call @external_wait_call
-  call @external_wait_call(%fence) : (!hal.fence) -> ()
+  // CHECK: util.call @external_wait_call
+  util.call @external_wait_call(%fence) : (!hal.fence) -> ()
   // CHECK: hal.fence.signal
   hal.fence.signal<%fence : !hal.fence>
-  // CHECK: return
-  return %fence : !hal.fence
+  // CHECK: util.return
+  util.return %fence : !hal.fence
 }
-func.func private @external_wait_call(!hal.fence)
+util.func private @external_wait_call(!hal.fence)
 
 // -----
 
 // Tests that awaits with no fences are elided.
 
 // CHECK-LABEL: @fence_await_none
-func.func @fence_await_none() -> i32 {
+util.func public @fence_await_none() -> i32 {
   %timeout = arith.constant 123 : i32
   // CHECK: %[[STATUS:.+]] = arith.constant 0 : i32
   %status = hal.fence.await until([]) timeout_millis(%timeout) : i32
-  // CHECK: return %[[STATUS]]
-  return %status : i32
+  // CHECK: util.return %[[STATUS]]
+  util.return %status : i32
 }
 
 // -----
@@ -122,14 +122,14 @@
 
 // CHECK-LABEL: @fence_await_null
 // CHECK-SAME: %[[ARG:.+]]: !hal.fence
-func.func @fence_await_null(%arg: !hal.fence) -> i32 {
+util.func public @fence_await_null(%arg: !hal.fence) -> i32 {
   %timeout = arith.constant 123 : i32
   // CHECK-NOT: util.null
   %null = util.null : !hal.fence
   // CHECK: %[[STATUS:.+]] = hal.fence.await until([%[[ARG]]])
   %status = hal.fence.await until([%arg, %null]) timeout_millis(%timeout) : i32
-  // CHECK: return %[[STATUS]]
-  return %status : i32
+  // CHECK: util.return %[[STATUS]]
+  util.return %status : i32
 }
 
 // -----
@@ -138,10 +138,10 @@
 
 // CHECK-LABEL: @fence_await_duplicate_fences
 // CHECK-SAME: %[[FENCE0:.+]]: !hal.fence, %[[FENCE1:.+]]: !hal.fence
-func.func @fence_await_duplicate_fences(%fence0: !hal.fence, %fence1: !hal.fence) -> i32 {
+util.func public @fence_await_duplicate_fences(%fence0: !hal.fence, %fence1: !hal.fence) -> i32 {
   %timeout = arith.constant 123 : i32
   // CHECK: %[[STATUS:.+]] = hal.fence.await until([%[[FENCE0]], %[[FENCE1]]])
   %status = hal.fence.await until([%fence0, %fence1, %fence0]) timeout_millis(%timeout) : i32
-  // CHECK: return %[[STATUS]]
-  return %status : i32
+  // CHECK: util.return %[[STATUS]]
+  util.return %status : i32
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/test/fence_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/IR/test/fence_ops.mlir
index 65f7d60..59a42cf 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/test/fence_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/test/fence_ops.mlir

@@ -1,53 +1,53 @@
 // RUN: iree-opt --split-input-file %s | iree-opt --split-input-file | FileCheck %s
 
 // CHECK-LABEL: @fence_create
-func.func @fence_create(%arg0: !hal.device) -> !hal.fence {
+util.func public @fence_create(%arg0: !hal.device) -> !hal.fence {
   // CHECK: = hal.fence.create device(%arg0 : !hal.device) flags("None") : !hal.fence
   %fence = hal.fence.create device(%arg0 : !hal.device) flags("None") : !hal.fence
-  return %fence : !hal.fence
+  util.return %fence : !hal.fence
 }
 
 // -----
 
 // CHECK-LABEL: @fence_join
-func.func @fence_join(%arg0: !hal.fence, %arg1: !hal.fence) -> !hal.fence {
+util.func public @fence_join(%arg0: !hal.fence, %arg1: !hal.fence) -> !hal.fence {
   // CHECK: = hal.fence.join at([%arg0, %arg1]) -> !hal.fence
   %fence = hal.fence.join at([%arg0, %arg1]) -> !hal.fence
-  return %fence : !hal.fence
+  util.return %fence : !hal.fence
 }
 
 // -----
 
 // CHECK-LABEL: @fence_query
-func.func @fence_query(%arg0: !hal.fence) -> i32 {
+util.func public @fence_query(%arg0: !hal.fence) -> i32 {
   // CHECK: = hal.fence.query<%arg0 : !hal.fence> : i32
   %status = hal.fence.query<%arg0 : !hal.fence> : i32
-  return %status : i32
+  util.return %status : i32
 }
 
 // -----
 
 // CHECK-LABEL: @fence_signal
-func.func @fence_signal(%arg0: !hal.fence) {
+util.func public @fence_signal(%arg0: !hal.fence) {
   // CHECK: hal.fence.signal<%arg0 : !hal.fence>
   hal.fence.signal<%arg0 : !hal.fence>
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @fence_fail
-func.func @fence_fail(%arg0: !hal.fence, %arg1: i32) {
+util.func public @fence_fail(%arg0: !hal.fence, %arg1: i32) {
   // CHECK: hal.fence.fail<%arg0 : !hal.fence> status(%arg1)
   hal.fence.fail<%arg0 : !hal.fence> status(%arg1)
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @fence_await
-func.func @fence_await(%arg0: !hal.fence, %arg1: !hal.fence, %arg2: i32) -> i32 {
+util.func public @fence_await(%arg0: !hal.fence, %arg1: !hal.fence, %arg2: i32) -> i32 {
   // CHECK: = hal.fence.await until([%arg0, %arg1]) timeout_millis(%arg2) : i32
   %status = hal.fence.await until([%arg0, %arg1]) timeout_millis(%arg2) : i32
-  return %status : i32
+  util.return %status : i32
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/test/invalid.mlir b/compiler/src/iree/compiler/Dialect/HAL/IR/test/invalid.mlir
index 742128a..80cea09 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/test/invalid.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/test/invalid.mlir

@@ -1,20 +1,20 @@
 // RUN: iree-opt --split-input-file --verify-diagnostics %s
 
 util.global mutable @var : !hal.buffer
-func.func @fn(%arg0: !hal.buffer_view) {
+util.func public @fn(%arg0: !hal.buffer_view) {
   // expected-error @+1 {{global "var" is '!hal.buffer' but store is '!hal.buffer_view'}}
   util.global.store %arg0, @var : !hal.buffer_view
-  return
+  util.return
 }
 
 // -----
 
 util.global mutable @var : !hal.buffer
-func.func @fn(%arg0: !hal.buffer_view) {
+util.func public @fn(%arg0: !hal.buffer_view) {
   %0 = util.global.address @var : !util.ptr<!hal.buffer>
   // expected-error @+1 {{global pointer is '!hal.buffer' but store is '!hal.buffer_view'}}
   util.global.store.indirect %arg0, %0 : !hal.buffer_view -> !util.ptr<!hal.buffer>
-  return
+  util.return
 }
 
 // -----

diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/test/tensor_op_folding.mlir b/compiler/src/iree/compiler/Dialect/HAL/IR/test/tensor_op_folding.mlir
index f7f295d..131b439 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/test/tensor_op_folding.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/test/tensor_op_folding.mlir

@@ -1,13 +1,13 @@
 // RUN: iree-opt --split-input-file --canonicalize -cse %s | iree-opt --split-input-file | FileCheck %s
 
 // CHECK-LABEL: @foldTensorImportExport
-func.func @foldTensorImportExport(%arg0: !hal.buffer_view) -> !hal.buffer_view {
+util.func public @foldTensorImportExport(%arg0: !hal.buffer_view) -> !hal.buffer_view {
   // CHECK-NOT: hal.tensor.import
   %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<5xi32>
   // CHECK-NOT: hal.tensor.export
   %1 = hal.tensor.export %0 : tensor<5xi32> -> !hal.buffer_view
-  // CHECK: return %arg0 : !hal.buffer_view
-  return %1 : !hal.buffer_view
+  // CHECK: util.return %arg0 : !hal.buffer_view
+  util.return %1 : !hal.buffer_view
 }
 
 // -----
@@ -18,33 +18,33 @@
 // For now we just don't fold.
 
 // CHECK-LABEL: @foldTensorImportExportTypeMismatch
-func.func @foldTensorImportExportTypeMismatch(%arg0: !hal.buffer_view) -> !hal.buffer {
+util.func public @foldTensorImportExportTypeMismatch(%arg0: !hal.buffer_view) -> !hal.buffer {
   // CHECK: hal.tensor.import
   %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<5xi32>
   // CHECK: hal.tensor.export
   %1 = hal.tensor.export %0 : tensor<5xi32> -> !hal.buffer
-  return %1 : !hal.buffer
+  util.return %1 : !hal.buffer
 }
 
 // -----
 
 // CHECK-LABEL: @foldTensorExportImport
-func.func @foldTensorExportImport(%arg0: tensor<5xi32>) -> tensor<5xi32> {
+util.func public @foldTensorExportImport(%arg0: tensor<5xi32>) -> tensor<5xi32> {
   // CHECK-NOT: hal.tensor.export
   %0 = hal.tensor.export %arg0 : tensor<5xi32> -> !hal.buffer_view
   // CHECK-NOT: hal.tensor.import
   %1 = hal.tensor.import %0 : !hal.buffer_view -> tensor<5xi32>
-  // CHECK: return %arg0 : tensor<5xi32>
-  return %1 : tensor<5xi32>
+  // CHECK: util.return %arg0 : tensor<5xi32>
+  util.return %1 : tensor<5xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @DeduplicateTensorBarrierSources
 // CHECK-SAME: (%[[ARG0:.+]]: tensor<5xi32>, %[[ARG1:.+]]: tensor<6xi32>, %[[FENCE:.+]]: !hal.fence)
-func.func @DeduplicateTensorBarrierSources(%arg0: tensor<5xi32>, %arg1: tensor<6xi32>, %fence: !hal.fence) -> (tensor<5xi32>, tensor<6xi32>, tensor<5xi32>) {
+util.func public @DeduplicateTensorBarrierSources(%arg0: tensor<5xi32>, %arg1: tensor<6xi32>, %fence: !hal.fence) -> (tensor<5xi32>, tensor<6xi32>, tensor<5xi32>) {
   // CHECK: %[[RESULTS:.+]]:2 = hal.tensor.barrier join(%[[ARG0]], %[[ARG1]] : tensor<5xi32>, tensor<6xi32>) => %[[FENCE]] : !hal.fence
   %0:3 = hal.tensor.barrier join(%arg0, %arg1, %arg0 : tensor<5xi32>, tensor<6xi32>, tensor<5xi32>) => %fence : !hal.fence
-  // CHECK: return %[[RESULTS]]#0, %[[RESULTS]]#1, %[[RESULTS]]#0 : tensor<5xi32>, tensor<6xi32>, tensor<5xi32>
-  return %0#0, %0#1, %0#2 : tensor<5xi32>, tensor<6xi32>, tensor<5xi32>
+  // CHECK: util.return %[[RESULTS]]#0, %[[RESULTS]]#1, %[[RESULTS]]#0 : tensor<5xi32>, tensor<6xi32>, tensor<5xi32>
+  util.return %0#0, %0#1, %0#2 : tensor<5xi32>, tensor<6xi32>, tensor<5xi32>
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/test/tensor_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/IR/test/tensor_ops.mlir
index 45e1c6f..bcee3c9 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/test/tensor_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/test/tensor_ops.mlir

@@ -1,55 +1,55 @@
 // RUN: iree-opt --split-input-file --mlir-print-local-scope %s | iree-opt --split-input-file --mlir-print-local-scope | FileCheck %s
 
 // CHECK-LABEL: @tensorImportStatic
-func.func @tensorImportStatic(%arg0: !hal.buffer_view) -> tensor<5xi32> {
+util.func public @tensorImportStatic(%arg0: !hal.buffer_view) -> tensor<5xi32> {
   // CHECK: hal.tensor.import %arg0 "hello" : !hal.buffer_view -> tensor<5xi32>
   %0 = hal.tensor.import %arg0 "hello" : !hal.buffer_view -> tensor<5xi32>
-  return %0 : tensor<5xi32>
+  util.return %0 : tensor<5xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorImportDynamic
-func.func @tensorImportDynamic(%arg0: !hal.buffer_view, %arg1: index) -> tensor<?x3xi32> {
+util.func public @tensorImportDynamic(%arg0: !hal.buffer_view, %arg1: index) -> tensor<?x3xi32> {
   // CHECK: hal.tensor.import %arg0 : !hal.buffer_view -> tensor<?x3xf32> as tensor<?x3xi32>{%arg1}
   %0 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<?x3xf32> as tensor<?x3xi32>{%arg1}
-  return %0 : tensor<?x3xi32>
+  util.return %0 : tensor<?x3xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorImportAsync
-func.func @tensorImportAsync(%arg0: !hal.buffer_view, %arg1: !hal.fence) -> tensor<5xi32> {
+util.func public @tensorImportAsync(%arg0: !hal.buffer_view, %arg1: !hal.fence) -> tensor<5xi32> {
   // CHECK: hal.tensor.import wait(%arg1) => %arg0 : !hal.buffer_view -> tensor<5xi32>
   %0 = hal.tensor.import wait(%arg1) => %arg0 : !hal.buffer_view -> tensor<5xi32>
-  return %0 : tensor<5xi32>
+  util.return %0 : tensor<5xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorExportDynamic
-func.func @tensorExportDynamic(%arg0: tensor<?x3xi32>, %arg1: index) -> !hal.buffer_view {
+util.func public @tensorExportDynamic(%arg0: tensor<?x3xi32>, %arg1: index) -> !hal.buffer_view {
   // CHECK: hal.tensor.export %arg0 "goodbye" : tensor<?x3xf32> as tensor<?x3xi32>{%arg1} -> !hal.buffer_view
   %0 = hal.tensor.export %arg0 "goodbye" : tensor<?x3xf32> as tensor<?x3xi32>{%arg1} -> !hal.buffer_view
-  return %0 : !hal.buffer_view
+  util.return %0 : !hal.buffer_view
 }
 
 // -----
 
 // CHECK-LABEL: @tensorExportInPlace
-func.func @tensorExportInPlace(%arg0: tensor<?x3xi32>, %arg1: index, %arg2: !hal.buffer) -> !hal.buffer_view {
+util.func public @tensorExportInPlace(%arg0: tensor<?x3xi32>, %arg1: index, %arg2: !hal.buffer) -> !hal.buffer_view {
   // CHECK: hal.tensor.export %arg0 into(%arg2 : !hal.buffer) : tensor<?x3xf32> as tensor<?x3xi32>{%arg1} -> !hal.buffer_view
   %0 = hal.tensor.export %arg0 into(%arg2 : !hal.buffer) : tensor<?x3xf32> as tensor<?x3xi32>{%arg1} -> !hal.buffer_view
-  return %0 : !hal.buffer_view
+  util.return %0 : !hal.buffer_view
 }
 
 // -----
 
 // CHECK-LABEL: @tensorBarrier
-func.func @tensorBarrier(%arg0: tensor<3xf32>, %arg1: tensor<4xf32>, %arg2: !hal.fence) -> (tensor<3xf32>, tensor<4xf32>) {
+util.func public @tensorBarrier(%arg0: tensor<3xf32>, %arg1: tensor<4xf32>, %arg2: !hal.fence) -> (tensor<3xf32>, tensor<4xf32>) {
   // CHECK: :2 = hal.tensor.barrier join(%arg0, %arg1 : tensor<3xf32>, tensor<4xf32>) => %arg2 : !hal.fence
   %0:2 = hal.tensor.barrier join(%arg0, %arg1 : tensor<3xf32>, tensor<4xf32>) => %arg2 : !hal.fence
-  return %0#0, %0#1 : tensor<3xf32>, tensor<4xf32>
+  util.return %0#0, %0#1 : tensor<3xf32>, tensor<4xf32>
 }
 
 // -----
@@ -57,8 +57,8 @@
 // Demonstrates the full functionality of an extern dispatch op.
 // Note that some fields are optional.
 
-// CHECK-LABEL: func.func @dispatchExtern
-func.func @dispatchExtern(%arg0: tensor<4xi32>, %arg1: tensor<8xi32>, %arg2: i32) -> tensor<8xi32> {
+// CHECK-LABEL: util.func public @dispatchExtern
+util.func public @dispatchExtern(%arg0: tensor<4xi32>, %arg1: tensor<8xi32>, %arg2: i32) -> tensor<8xi32> {
   // CHECK-DAG: %[[WORKLOAD_X:.+]] = arith.constant 100
   %workload_x = arith.constant 100 : index
   // CHECK-DAG: %[[WORKLOAD_Y:.+]] = arith.constant 50
@@ -113,6 +113,6 @@
       // CHECK: } ordinal(300) = [#hal.executable.object<{path = "c.o"}>]
       } ordinal(300) = [#hal.executable.object<{path = "c.o"}>]
     })
-  // CHECK: return %[[RESULT]]
-  return %0 : tensor<8xi32>
+  // CHECK: util.return %[[RESULT]]
+  util.return %0 : tensor<8xi32>
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Transforms/DumpExecutableBenchmarks.cpp b/compiler/src/iree/compiler/Dialect/HAL/Transforms/DumpExecutableBenchmarks.cpp
index 3e7e4b9..25ff8b9 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/DumpExecutableBenchmarks.cpp
+++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/DumpExecutableBenchmarks.cpp

@@ -16,7 +16,6 @@
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/ToolOutputFile.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
@@ -228,7 +227,8 @@
   // Create an exported benchmark function that runs the dispatches.
   auto funcType =
       moduleBuilder.getFunctionType({moduleBuilder.getI32Type()}, {});
-  auto funcOp = moduleBuilder.create<func::FuncOp>(loc, baseName, funcType);
+  auto funcOp =
+      moduleBuilder.create<IREE::Util::FuncOp>(loc, baseName, funcType);
   funcOp.setVisibility(SymbolTable::Visibility::Public);
 
   // Mark the function as being a dispatch benchmark.
@@ -377,7 +377,7 @@
   funcBuilder.create<IREE::Util::StatusCheckOkOp>(
       loc, fenceOp.getStatus(), "failed to wait on timepoint");
 
-  funcBuilder.create<mlir::func::ReturnOp>(loc);
+  funcBuilder.create<IREE::Util::ReturnOp>(loc);
 }
 
 // Builds a module exporting one function for each dispatch configuration

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Transforms/MaterializeDispatchInstrumentation.cpp b/compiler/src/iree/compiler/Dialect/HAL/Transforms/MaterializeDispatchInstrumentation.cpp
index 05c3868..2c4a595 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/MaterializeDispatchInstrumentation.cpp
+++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/MaterializeDispatchInstrumentation.cpp

@@ -18,7 +18,6 @@
 #include "iree/schemas/instruments/dispatch.h"
 #include "iree/schemas/instruments/dispatch_def_builder.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinTypes.h"
@@ -310,7 +309,7 @@
     // Create query function for getting the instrumentation data.
     auto listType = moduleBuilder.getType<IREE::Util::ListType>(
         moduleBuilder.getType<IREE::Util::VariantType>());
-    auto queryOp = moduleBuilder.create<func::FuncOp>(
+    auto queryOp = moduleBuilder.create<IREE::Util::FuncOp>(
         loc, "__query_instruments",
         moduleBuilder.getFunctionType({listType}, {}));
     {
@@ -359,7 +358,7 @@
       }
 
       appendListItems(loc, listArg, iovecs, queryBuilder);
-      queryBuilder.create<func::ReturnOp>(loc);
+      queryBuilder.create<IREE::Util::ReturnOp>(loc);
     }
   }
 };

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Transforms/MaterializeResourceCaches.cpp b/compiler/src/iree/compiler/Dialect/HAL/Transforms/MaterializeResourceCaches.cpp
index 01ba030..e60747a 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/MaterializeResourceCaches.cpp
+++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/MaterializeResourceCaches.cpp

@@ -13,7 +13,6 @@
 #include "iree/compiler/Dialect/Util/IR/UtilDialect.h"
 #include "iree/compiler/Dialect/Util/IR/UtilOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
@@ -288,16 +287,16 @@
     auto funcName = (StringRef("__constant_block_") +
                      std::to_string(nextUniqueConstantBlockId++))
                         .str();
-    auto funcOp = moduleBuilder.create<func::FuncOp>(blockOp.getLoc(), funcName,
-                                                     blockOp.getFunctionType());
+    auto funcOp = moduleBuilder.create<IREE::Util::FuncOp>(
+        blockOp.getLoc(), funcName, blockOp.getFunctionType());
     funcOp.setPrivate();
     funcOp.getRegion().takeBody(blockOp.getRegion());
 
     // Replace the hal.return with a func.return.
     for (auto returnOp :
          llvm::make_early_inc_range(funcOp.getOps<IREE::HAL::ReturnOp>())) {
-      OpBuilder(returnOp).create<func::ReturnOp>(returnOp.getLoc(),
-                                                 returnOp.getOperands());
+      OpBuilder(returnOp).create<IREE::Util::ReturnOp>(returnOp.getLoc(),
+                                                       returnOp.getOperands());
       returnOp.erase();
     }
 
@@ -306,8 +305,8 @@
     if (funcOp.getNumArguments() > 0) {
       callOperands.push_back(device);
     }
-    auto callOp = callerBuilder.create<func::CallOp>(blockOp.getLoc(), funcOp,
-                                                     callOperands);
+    auto callOp = callerBuilder.create<IREE::Util::CallOp>(
+        blockOp.getLoc(), funcOp, callOperands);
 
     return llvm::map_to_vector(callOp.getResults(),
                                [](OpResult result) -> Value { return result; });

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Transforms/Passes.td b/compiler/src/iree/compiler/Dialect/HAL/Transforms/Passes.td
index 0c8f376..0480bf8 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/Passes.td
+++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/Passes.td

@@ -410,7 +410,6 @@
   ];
   let dependentDialects = [
     "mlir::arith::ArithDialect",
-    "mlir::func::FuncDialect",
     "IREE::HAL::HALDialect",
     "IREE::Stream::StreamDialect",
     "IREE::Util::UtilDialect",
@@ -427,7 +426,6 @@
   }];
   let dependentDialects = [
     "mlir::arith::ArithDialect",
-    "mlir::func::FuncDialect",
     "mlir::scf::SCFDialect",
     "IREE::HAL::HALDialect",
     "IREE::Util::UtilDialect",
@@ -511,7 +509,6 @@
   ];
   let dependentDialects = [
     "mlir::arith::ArithDialect",
-    "mlir::func::FuncDialect",
     "mlir::scf::SCFDialect",
     "IREE::HAL::HALDialect",
     "IREE::Util::UtilDialect",

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/convert_to_hal.mlir b/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/convert_to_hal.mlir
index 5040104..590649e 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/convert_to_hal.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/convert_to_hal.mlir

@@ -48,9 +48,9 @@
     }
   }
 
-  // CHECK-LABEL: func.func @simpleDispatch
+  // CHECK-LABEL: util.func public @simpleDispatch
   //  CHECK-SAME: (%[[ARG0:.+]]: !hal.buffer_view, %[[ARG1:.+]]: !hal.buffer_view) -> !hal.buffer_view
-  func.func @simpleDispatch(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
+  util.func public @simpleDispatch(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
     %c1 = arith.constant 1 : index
     %c4 = arith.constant 4 : index
     %c16 = arith.constant 16 : index
@@ -156,8 +156,8 @@
     // CHECK-SAME: type(%[[ELEMENT_TYPE]])
     // CHECK-SAME: encoding(%[[ENCODING_TYPE]])
     %result_view = stream.tensor.export %result_ready : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
-    // CHECK: return
-    return %result_view : !hal.buffer_view
+    // CHECK: util.return
+    util.return %result_view : !hal.buffer_view
   }
 
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/dump_executable_benchmarks.mlir b/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/dump_executable_benchmarks.mlir
index cdac0d8..c244510 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/dump_executable_benchmarks.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/dump_executable_benchmarks.mlir

@@ -66,7 +66,7 @@
   // CHECK: %[[BUFFER:.+]] = hal.allocator.allocate<%{{.+}} : !hal.allocator> affinity(%{{.+}}) type("DeviceVisible|DeviceLocal") usage("{{.+}}Dispatch{{.+}}") : !hal.buffer{%c768}
   // CHECK-NEXT: util.global.store %[[BUFFER]], @ex0_embedded_elf_x86_64_dispatch0_512_buffer : !hal.buffer
 
-  // CHECK: func.func @ex0_embedded_elf_x86_64_dispatch0_512(%arg0: i32)
+  // CHECK: util.func public @ex0_embedded_elf_x86_64_dispatch0_512(%arg0: i32)
   // CHECK-SAME: attributes {iree.abi.stub, iree.reflection = {iree.benchmark = "dispatch"}} {
   // CHECK: %[[BATCH_SIZE:.+]] = arith.index_cast %arg0 : i32 to index
 
@@ -104,14 +104,14 @@
   // ===========================================================================
 
   // CHECK: util.global private mutable @ex0_embedded_elf_x86_64_dispatch1_512x1_buffer : !hal.buffer
-  // CHECK: func.func @ex0_embedded_elf_x86_64_dispatch1_512x1(%arg0: i32)
+  // CHECK: util.func public @ex0_embedded_elf_x86_64_dispatch1_512x1(%arg0: i32)
   // CHECK:   hal.command_buffer.dispatch.symbol<%{{.+}} : !hal.command_buffer> target(@ex0::@embedded_elf_x86_64::@dispatch1)
 
   // CHECK: util.global private mutable @ex0_embedded_elf_x86_64_dispatch1_128x32_buffer : !hal.buffer
-  // CHECK: func.func @ex0_embedded_elf_x86_64_dispatch1_128x32(%arg0: i32)
+  // CHECK: util.func public @ex0_embedded_elf_x86_64_dispatch1_128x32(%arg0: i32)
   // CHECK:   hal.command_buffer.dispatch.symbol<%{{.+}} : !hal.command_buffer> target(@ex0::@embedded_elf_x86_64::@dispatch1)
 
-  func.func private @main(%dynamic_arg: i32) -> !stream.timepoint {
+  util.func public @main(%dynamic_arg: i32) -> !stream.timepoint {
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
     %c32 = arith.constant 32 : index
@@ -170,6 +170,6 @@
       ]}
     } => !stream.timepoint
     %39 = stream.resource.dealloca await(%6) => %result : !stream.resource<transient>{%c128} => !stream.timepoint
-    return %39 : !stream.timepoint
+    util.return %39 : !stream.timepoint
   }
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/elide_redundant_commands.mlir b/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/elide_redundant_commands.mlir
index 781bda3..861d4e0 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/elide_redundant_commands.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/elide_redundant_commands.mlir

@@ -1,10 +1,10 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(func.func(iree-hal-elide-redundant-commands))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(util.func(iree-hal-elide-redundant-commands))' %s | FileCheck %s
 
 // Tests that redundant barriers are elided but barriers gaurding ops are not.
 
 // CHECK-LABEL: @elideRedundantBarriers
 // CHECK-SAME: (%[[CMD:.+]]: !hal.command_buffer, %[[LAYOUT:.+]]: !hal.pipeline_layout)
-func.func @elideRedundantBarriers(%cmd: !hal.command_buffer, %pipeline_layout: !hal.pipeline_layout) {
+util.func public @elideRedundantBarriers(%cmd: !hal.command_buffer, %pipeline_layout: !hal.pipeline_layout) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c42_i32 = arith.constant 42 : i32
@@ -16,14 +16,14 @@
   hal.command_buffer.push_constants<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout) offset(0) values([%c42_i32]) : i32
   // CHECK: hal.command_buffer.execution_barrier
   hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|Transfer|CommandRetire") target("CommandIssue|Dispatch|Transfer") flags("None")
-  // CHECK: return
-  return
+  // CHECK: util.return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @elidePushConstants
-func.func @elidePushConstants(%cmd: !hal.command_buffer, %pipeline_layout: !hal.pipeline_layout) {
+util.func public @elidePushConstants(%cmd: !hal.command_buffer, %pipeline_layout: !hal.pipeline_layout) {
   // CHECK-DAG: %[[C0:.+]] = arith.constant 0
   %c0 = arith.constant 0 : i32
   // CHECK-DAG: %[[C1:.+]] = arith.constant 1
@@ -43,14 +43,14 @@
       layout(%pipeline_layout : !hal.pipeline_layout)
       offset(0)
       values([%c0, %c1]) : i32, i32
-  // CHECK: return
-  return
+  // CHECK: util.return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @elidePushConstantsPrefix
-func.func @elidePushConstantsPrefix(%cmd: !hal.command_buffer, %pipeline_layout: !hal.pipeline_layout) {
+util.func public @elidePushConstantsPrefix(%cmd: !hal.command_buffer, %pipeline_layout: !hal.pipeline_layout) {
   // CHECK-DAG: %[[C0:.+]] = arith.constant 0
   %c0 = arith.constant 0 : i32
   // CHECK-DAG: %[[C1:.+]] = arith.constant 1
@@ -70,14 +70,14 @@
       layout(%pipeline_layout : !hal.pipeline_layout)
       offset(1)
       values([%c1]) : i32
-  // CHECK: return
-  return
+  // CHECK: util.return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @elidePushConstantsSuffix
-func.func @elidePushConstantsSuffix(%cmd: !hal.command_buffer, %pipeline_layout: !hal.pipeline_layout) {
+util.func public @elidePushConstantsSuffix(%cmd: !hal.command_buffer, %pipeline_layout: !hal.pipeline_layout) {
   // CHECK-DAG: %[[C0:.+]] = arith.constant 0
   %c0 = arith.constant 0 : i32
   // CHECK-DAG: %[[C1:.+]] = arith.constant 1
@@ -94,8 +94,8 @@
       layout(%pipeline_layout : !hal.pipeline_layout)
       offset(1)
       values([%c0, %c2]) : i32, i32
-  // CHECK: return
-  return
+  // CHECK: util.return
+  util.return
 }
 
 // -----
@@ -104,7 +104,7 @@
 
 // CHECK-LABEL: @elidePushDescriptorSet
 // CHECK-SAME: (%[[CMD:.+]]: !hal.command_buffer, %[[LAYOUT:.+]]: !hal.pipeline_layout, %[[BUFFER0:.+]]: !hal.buffer, %[[BUFFER1:.+]]: !hal.buffer)
-func.func @elidePushDescriptorSet(%cmd: !hal.command_buffer, %pipeline_layout: !hal.pipeline_layout, %buffer0: !hal.buffer, %buffer1: !hal.buffer) {
+util.func public @elidePushDescriptorSet(%cmd: !hal.command_buffer, %pipeline_layout: !hal.pipeline_layout, %buffer0: !hal.buffer, %buffer1: !hal.buffer) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   // CHECK-DAG: %[[SIZE0:.+]] = arith.constant 100
@@ -124,6 +124,6 @@
     %c0 = (%buffer0 : !hal.buffer)[%c0, %size0],
     %c1 = (%buffer1 : !hal.buffer)[%c0, %size1]
   ])
-  // CHECK: return
-  return
+  // CHECK: util.return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/fixup_legacy_sync.mlir b/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/fixup_legacy_sync.mlir
index a26a374..1a913ed 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/fixup_legacy_sync.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/fixup_legacy_sync.mlir

@@ -5,10 +5,10 @@
 
 module attributes {hal.device.targets = [#hal.device.target<"vulkan", {legacy_sync}>]} {
 // CHECK-LABEL: @command_buffer_reusable
-func.func @command_buffer_reusable(%arg0: !hal.device) {
+util.func public @command_buffer_reusable(%arg0: !hal.device) {
   // CHECK: hal.command_buffer.create device(%arg0 : !hal.device) mode("None")
   %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode("None") categories("Transfer|Dispatch") : !hal.command_buffer
-  return
+  util.return
 }
 }  // module
 
@@ -18,10 +18,10 @@
 
 module attributes {hal.device.targets = [#hal.device.target<"vulkan", {legacy_sync}>]} {
 // CHECK-LABEL: @command_buffer_oneshot
-func.func @command_buffer_oneshot(%arg0: !hal.device) {
+util.func public @command_buffer_oneshot(%arg0: !hal.device) {
   // CHECK: hal.command_buffer.create device(%arg0 : !hal.device) mode("OneShot|AllowInlineExecution")
   %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") : !hal.command_buffer
-  return
+  util.return
 }
 }  // module
 
@@ -34,10 +34,10 @@
   #hal.device.target<"vulkan", {}>
 ]} {
 // CHECK-LABEL: @legacy_mode_not_required
-func.func @legacy_mode_not_required(%arg0: !hal.device) {
+util.func public @legacy_mode_not_required(%arg0: !hal.device) {
   // CHECK: hal.command_buffer.create device(%arg0 : !hal.device) mode(OneShot)
   %cmd = hal.command_buffer.create device(%arg0 : !hal.device) mode(OneShot) categories("Transfer|Dispatch") : !hal.command_buffer
-  return
+  util.return
 }
 }  // module
 
@@ -50,7 +50,7 @@
   #hal.device.target<"vulkan", {legacy_sync}>
 ]} {
 // CHECK-LABEL: @mixed_legacy_mode_required
-func.func @mixed_legacy_mode_required(%device: !hal.device, %wait: !hal.fence, %cmd: !hal.command_buffer, %signal: !hal.fence) {
+util.func public @mixed_legacy_mode_required(%device: !hal.device, %wait: !hal.fence, %cmd: !hal.command_buffer, %signal: !hal.fence) {
   %affinity = arith.constant 0 : i64
   // CHECK: hal.fence.await
   // CHECK: hal.device.queue.execute
@@ -59,7 +59,7 @@
       affinity(%affinity)
       wait(%wait) signal(%signal)
       commands([%cmd])
-  return
+  util.return
 }
 }  // module
 
@@ -70,7 +70,7 @@
 module attributes {hal.device.targets = [#hal.device.target<"vulkan", {legacy_sync}>]} {
 // CHECK-LABEL: @blocking_execute
 // CHECK-SAME: (%[[DEVICE:.+]]: !hal.device, %[[WAIT:.+]]: !hal.fence, %[[CMD:.+]]: !hal.command_buffer, %[[SIGNAL:.+]]: !hal.fence)
-func.func @blocking_execute(%device: !hal.device, %wait: !hal.fence, %cmd: !hal.command_buffer, %signal: !hal.fence) {
+util.func public @blocking_execute(%device: !hal.device, %wait: !hal.fence, %cmd: !hal.command_buffer, %signal: !hal.fence) {
   %affinity = arith.constant 0 : i64
   //  CHECK-DAG: %[[NULL:.+]] = util.null : !hal.fence
   //  CHECK-DAG: hal.fence.await until([%[[WAIT]]])
@@ -82,7 +82,7 @@
       affinity(%affinity)
       wait(%wait) signal(%signal)
       commands([%cmd])
-  return
+  util.return
 }
 }  // module
 
@@ -93,7 +93,7 @@
 module attributes {hal.device.targets = [#hal.device.target<"vulkan", {legacy_sync}>]} {
 // CHECK-LABEL: @blocking_execute
 // CHECK-SAME: (%[[DEVICE:.+]]: !hal.device, %[[WAIT:.+]]: !hal.fence, %[[CMD:.+]]: !hal.command_buffer, %[[SIGNAL:.+]]: !hal.fence)
-func.func @blocking_execute(%device: !hal.device, %wait: !hal.fence, %cmd: !hal.command_buffer, %signal: !hal.fence) {
+util.func public @blocking_execute(%device: !hal.device, %wait: !hal.fence, %cmd: !hal.command_buffer, %signal: !hal.fence) {
   // CHECK-NEXT: %[[TIMEOUT:.+]] = arith.constant 100
   %timeout = arith.constant 100 : i32
   // CHECK-NEXT: hal.fence.await until([%[[WAIT]]]) timeout_millis(%[[TIMEOUT]])
@@ -111,7 +111,7 @@
       commands([%cmd])
   // CHECK-NEXT: hal.fence.await until([%[[SIGNAL]]]) timeout_millis(%[[TIMEOUT]])
   hal.fence.await until([%signal]) timeout_millis(%timeout) : i32
-  // CHECK-NEXT: return
-  return
+  // CHECK-NEXT: util.return
+  util.return
 }
 }  // module

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/materialize_dispatch_instrumentation.mlir b/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/materialize_dispatch_instrumentation.mlir
index fcb1a75..c51b98f 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/materialize_dispatch_instrumentation.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/materialize_dispatch_instrumentation.mlir

@@ -17,7 +17,7 @@
   // CHECK:   util.global.store %[[ALLOC_BUFFER]], @__dispatch_instrumentation
 
   // Query function used by tools to get the buffers and metadata:
-  // CHECK: func.func @__query_instruments(%[[LIST:.+]]: !util.list<?>)
+  // CHECK: util.func public @__query_instruments(%[[LIST:.+]]: !util.list<?>)
   // CHECK:   %[[INTERNAL_BUFFER:.+]] = util.global.load @__dispatch_instrumentation
   // CHECK:   %[[EXPORTED_BUFFER:.+]] = stream.tensor.export %[[INTERNAL_BUFFER]]
   // CHECK:   util.list.set %[[LIST]]{{.+}}
@@ -56,7 +56,7 @@
       }
     }
   }
-  func.func @main(%arg0: !stream.resource<external>) -> !stream.resource<external> {
+  util.func public @main(%arg0: !stream.resource<external>) -> !stream.resource<external> {
     %c0 = arith.constant 0 : index
     %c128 = arith.constant 128 : index
     %ret0 = stream.resource.alloc uninitialized : !stream.resource<external>{%c128}
@@ -75,6 +75,6 @@
       }
     } => !stream.timepoint
     %ret0_ready = stream.timepoint.await %timepoint => %ret0 : !stream.resource<external>{%c128}
-    return %ret0_ready : !stream.resource<external>
+    util.return %ret0_ready : !stream.resource<external>
   }
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/materialize_interfaces.mlir b/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/materialize_interfaces.mlir
index 0d02221..7697d0d 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/materialize_interfaces.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/materialize_interfaces.mlir

@@ -48,7 +48,7 @@
       }
     }
   }
-  func.func @main(%arg0: !stream.resource<constant>, %arg1: !stream.resource<transient>, %arg2: index, %arg3: i32) -> !stream.resource<transient> {
+  util.func public @main(%arg0: !stream.resource<constant>, %arg1: !stream.resource<transient>, %arg2: index, %arg3: i32) -> !stream.resource<transient> {
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
     %c2 = arith.constant 2 : index
@@ -67,7 +67,7 @@
       }
     } => !stream.timepoint
     %2 = stream.timepoint.await %1 => %0 : !stream.resource<transient>{%arg2}
-    return %2 : !stream.resource<transient>
+    util.return %2 : !stream.resource<transient>
   }
 }
 

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/materialize_resource_caches.mlir b/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/materialize_resource_caches.mlir
index 3701456..d706534 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/materialize_resource_caches.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/materialize_resource_caches.mlir

@@ -13,7 +13,7 @@
 // CHECK-NEXT:   util.global.store %[[LAYOUT]], @_descriptor_set_layout_0 : !hal.descriptor_set_layout
 
 // CHECK-LABEL: @descriptorSetLayoutLookup
-func.func @descriptorSetLayoutLookup(%device : !hal.device) -> !hal.descriptor_set_layout {
+util.func public @descriptorSetLayoutLookup(%device : !hal.device) -> !hal.descriptor_set_layout {
   // CHECK-NEXT: %[[LAYOUT:.+]] = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
   %0 = hal.descriptor_set_layout.lookup device(%device : !hal.device)
                                         flags("None")
@@ -21,8 +21,8 @@
     #hal.descriptor_set.binding<0, storage_buffer>,
     #hal.descriptor_set.binding<1, storage_buffer>
   ]) : !hal.descriptor_set_layout
-  // CHECK-NEXT: return %[[LAYOUT]]
-  return %0 : !hal.descriptor_set_layout
+  // CHECK-NEXT: util.return %[[LAYOUT]]
+  util.return %0 : !hal.descriptor_set_layout
 }
 
 // -----
@@ -40,7 +40,7 @@
 // CHECK-NEXT:   util.global.store %[[LAYOUT]], @_pipeline_layout_0 : !hal.pipeline_layout
 
 // CHECK-LABEL: @exeLayoutLookup
-func.func @exeLayoutLookup(%device : !hal.device) -> !hal.pipeline_layout {
+util.func public @exeLayoutLookup(%device : !hal.device) -> !hal.pipeline_layout {
   // CHECK: %[[LAYOUT:.+]] = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
   %0 = hal.pipeline_layout.lookup device(%device : !hal.device)
                                     layout(#hal.pipeline.layout<push_constants = 1, sets = [
@@ -49,8 +49,8 @@
       #hal.descriptor_set.binding<1, storage_buffer>
     ]>
   ]>) : !hal.pipeline_layout
-  // CHECK-NEXT: return %[[LAYOUT]]
-  return %0 : !hal.pipeline_layout
+  // CHECK-NEXT: util.return %[[LAYOUT]]
+  util.return %0 : !hal.pipeline_layout
 }
 
 // -----
@@ -70,7 +70,7 @@
 // CHECK-NEXT:   util.global.store %[[LAYOUT]], @_pipeline_layout_0 : !hal.pipeline_layout
 
 // CHECK-LABEL: @sharedLayoutLookup
-func.func @sharedLayoutLookup(%device : !hal.device) -> !hal.pipeline_layout {
+util.func public @sharedLayoutLookup(%device : !hal.device) -> !hal.pipeline_layout {
   // CHECK: %[[LAYOUT:.+]] = util.global.load @_pipeline_layout_0 : !hal.pipeline_layout
   %0 = hal.pipeline_layout.lookup device(%device : !hal.device)
                                     layout(#hal.pipeline.layout<push_constants = 1, sets = [
@@ -83,12 +83,12 @@
       #hal.descriptor_set.binding<1, uniform_buffer>
     ]>
   ]>) : !hal.pipeline_layout
-  // CHECK-NEXT: return %[[LAYOUT]]
-  return %0 : !hal.pipeline_layout
+  // CHECK-NEXT: util.return %[[LAYOUT]]
+  util.return %0 : !hal.pipeline_layout
 }
 
 // CHECK: @otherDescriptorSetLayoutLookup
-func.func @otherDescriptorSetLayoutLookup(%device : !hal.device) -> !hal.descriptor_set_layout {
+util.func public @otherDescriptorSetLayoutLookup(%device : !hal.device) -> !hal.descriptor_set_layout {
   // CHECK: %[[LAYOUT:.+]] = util.global.load @_descriptor_set_layout_0 : !hal.descriptor_set_layout
   %0 = hal.descriptor_set_layout.lookup device(%device : !hal.device)
                                         flags(None)
@@ -96,8 +96,8 @@
     #hal.descriptor_set.binding<0, storage_buffer>,
     #hal.descriptor_set.binding<1, storage_buffer>
   ]) : !hal.descriptor_set_layout
-  // CHECK-NEXT: return %[[LAYOUT]]
-  return %0 : !hal.descriptor_set_layout
+  // CHECK-NEXT: util.return %[[LAYOUT]]
+  util.return %0 : !hal.descriptor_set_layout
 }
 
 // -----
@@ -180,8 +180,8 @@
 // CHECK:     %[[LAYOUT1:.+]] = util.global.load @_pipeline_layout_1 : !hal.pipeline_layout
 
 // Constant block initializers:
-// CHECK:     %[[CONST_01:.+]]:2 = func.call @__constant_block_0()
-// CHECK:     %[[CONST_2:.+]] = func.call @__constant_block_1(%[[DEVICE]])
+// CHECK:     %[[CONST_01:.+]]:2 = util.call @__constant_block_0()
+// CHECK:     %[[CONST_2:.+]] = util.call @__constant_block_1(%[[DEVICE]])
 
 // Executable creation:
 // CHECK:     %[[EXE:.+]] = hal.executable.create
@@ -202,26 +202,26 @@
 // CHECK:   util.global.store %[[RET]], @_executable_exe : !hal.executable
 
 // Inlined constant block functions (here we ensure all blocks are cloned):
-// CHECK: func.func private @__constant_block_0() -> (i32, i32)
+// CHECK: util.func private @__constant_block_0() -> (i32, i32)
 // CHECK-DAG: %[[C0:.+]] = arith.constant 123
 // CHECK-DAG: %[[C1:.+]] = arith.constant 456
-// CHECK: return %[[C0]], %[[C1]]
-// CHECK: func.func private @__constant_block_1(%[[BLOCK_DEVICE:.+]]: !hal.device) -> i32
+// CHECK: util.return %[[C0]], %[[C1]]
+// CHECK: util.func private @__constant_block_1(%[[BLOCK_DEVICE:.+]]: !hal.device) -> i32
 // CHECK:   %[[OK:.+]], %[[VALUE:.+]] = hal.device.query<%[[BLOCK_DEVICE]] : !hal.device> key("sys" :: "baz")
 // CHECK:   cf.cond_br %[[OK]], ^bb1, ^bb2
 // CHECK: ^bb1:
-// CHECK:   return %[[VALUE]]
+// CHECK:   util.return %[[VALUE]]
 // CHECK: ^bb2:
 // CHECK:   %[[DUMMY:.+]] = arith.constant 0
-// CHECK:   return %[[DUMMY]]
+// CHECK:   util.return %[[DUMMY]]
 
 // CHECK-LABEL: @exeLookup
-func.func @exeLookup(%device : !hal.device) -> !hal.executable {
+util.func public @exeLookup(%device : !hal.device) -> !hal.executable {
   // CHECK: %[[EXE:.+]] = util.global.load @_executable_exe : !hal.executable
   %0 = hal.executable.lookup device(%device : !hal.device)
                              executable(@exe) : !hal.executable
-  // CHECK-NEXT: return %[[EXE]]
-  return %0 : !hal.executable
+  // CHECK-NEXT: util.return %[[EXE]]
+  util.return %0 : !hal.executable
 }
 
 }
@@ -293,11 +293,11 @@
 }
 
 // CHECK-LABEL: @exeLookup
-func.func @exeLookup(%device : !hal.device) -> !hal.executable {
+util.func public @exeLookup(%device : !hal.device) -> !hal.executable {
   // CHECK: %[[EXE:.+]] = util.global.load @_executable_exe : !hal.executable
   %0 = util.global.load @_executable_exe : !hal.executable
-  // CHECK-NEXT: return %[[EXE]]
-  return %0 : !hal.executable
+  // CHECK-NEXT: util.return %[[EXE]]
+  util.return %0 : !hal.executable
 }
 
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/memoize_device_queries.mlir b/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/memoize_device_queries.mlir
index 1fd8492..5211bd9 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/memoize_device_queries.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/memoize_device_queries.mlir

@@ -18,8 +18,8 @@
 
 // CHECK: util.global private @_device_query_2
 
-// CHECK-LABEL: func.func @device_matchers
-func.func @device_matchers(%device : !hal.device) -> (i1, i1, i1, i1, i1, i1) {
+// CHECK-LABEL: util.func public @device_matchers
+util.func public @device_matchers(%device : !hal.device) -> (i1, i1, i1, i1, i1, i1) {
   // Same queries (same variables):
   // CHECK-NEXT: = util.global.load @_device_query_0_ok : i1
   // CHECK-NEXT: = util.global.load @_device_query_0 : i1
@@ -34,5 +34,5 @@
   // CHECK-NEXT: = util.global.load @_device_query_2 : i1
   %id1_b_ok, %id1_b = hal.device.query<%device : !hal.device> key("hal.device.id" :: "id1") : i1, i1 = true
 
-  return %id0_a_ok, %id0_a, %id0_b_ok, %id0_b, %id1_a, %id1_b : i1, i1, i1, i1, i1, i1
+  util.return %id0_a_ok, %id0_a, %id0_b_ok, %id0_b, %id1_a, %id1_b : i1, i1, i1, i1, i1, i1
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/repeat_dispatches.mlir b/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/repeat_dispatches.mlir
index 7e60dbc..0d9d21a 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/repeat_dispatches.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/repeat_dispatches.mlir

@@ -1,11 +1,11 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(func.func(iree-hal-repeat-dispatches{count=2}))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(util.func(iree-hal-repeat-dispatches{count=2}))' %s | FileCheck %s
 
 util.global @_executable : !hal.executable
 
 // CHECK-LABEL: @duplicate_dispatches
 //  CHECK-SAME: (%[[CMD1:.+]]: !hal.command_buffer,
 //  CHECK-SAME:  %[[CMD2:.+]]: !hal.command_buffer)
-func.func @duplicate_dispatches(%cmd1 : !hal.command_buffer, %cmd2 : !hal.command_buffer) {
+util.func public @duplicate_dispatches(%cmd1 : !hal.command_buffer, %cmd2 : !hal.command_buffer) {
   // CHECK: %[[EXE:.+]] = util.global.load @_executable
   %exe = util.global.load @_executable : !hal.executable
 
@@ -19,7 +19,7 @@
   hal.command_buffer.dispatch<%cmd2 : !hal.command_buffer> target(%exe : !hal.executable)[3] workgroups([%c2, %c2, %c2])
   hal.command_buffer.execution_barrier<%cmd2 : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
 
-  return
+  util.return
 }
 
 // CHECK: hal.command_buffer.dispatch<%[[CMD1]] : !hal.command_buffer> target(%[[EXE]] : !hal.executable)[0] workgroups([%c1, %c1, %c1])
@@ -49,7 +49,7 @@
 // CHECK-LABEL: @nested_dispatch
 //  CHECK-SAME: (%[[CMD1:.+]]: !hal.command_buffer,
 //  CHECK-SAME:  %[[IDX:.+]]: index)
-func.func @nested_dispatch(%cmd1 : !hal.command_buffer, %idx : index) {
+util.func public @nested_dispatch(%cmd1 : !hal.command_buffer, %idx : index) {
   // CHECK: %[[EXE:.+]] = util.global.load @_executable
   %exe = util.global.load @_executable : !hal.executable
 
@@ -62,7 +62,7 @@
   default {
   }
 
-  return
+  util.return
 }
 
 // CHECK: scf.index_switch

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/resolve_export_ordinals.mlir b/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/resolve_export_ordinals.mlir
index 22d6f35..748f48e 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/resolve_export_ordinals.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/resolve_export_ordinals.mlir

@@ -15,7 +15,7 @@
 
 // CHECK-LABEL: @dispatch_with_nested_references
 // CHECK-SAME: %[[CMD:.+]]: !hal.command_buffer
-func.func @dispatch_with_nested_references(%cmd : !hal.command_buffer) {
+util.func public @dispatch_with_nested_references(%cmd : !hal.command_buffer) {
   %c10 = arith.constant 10 : index
   %c11 = arith.constant 11 : index
   %c12 = arith.constant 12 : index
@@ -29,13 +29,13 @@
   hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer>
       target(@exe::@target::@entry)
       workgroups([%c10, %c11, %c12])
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @dispatch_already_using_ordinals
-func.func @dispatch_already_using_ordinals(
+util.func public @dispatch_already_using_ordinals(
   // CHECK-SAME: %[[CMD:.+]]: !hal.command_buffer
   %cmd: !hal.command_buffer,
   // CHECK-SAME: %[[EXE:.+]]: !hal.executable
@@ -50,7 +50,7 @@
   hal.command_buffer.dispatch<%cmd : !hal.command_buffer>
       target(%exe : !hal.executable)[2]
       workgroups([%c10, %c11, %c12])
-  return
+  util.return
 }
 
 // -----
@@ -69,7 +69,7 @@
 }
 
 // CHECK-LABEL: @dispatch_indirect_with_nested_references
-func.func @dispatch_indirect_with_nested_references(
+util.func public @dispatch_indirect_with_nested_references(
   // CHECK-SAME: %[[CMD:.+]]: !hal.command_buffer
   %cmd: !hal.command_buffer,
   // CHECK-SAME: %[[BUF:.+]]: !hal.buffer
@@ -84,13 +84,13 @@
   hal.command_buffer.dispatch.indirect.symbol<%cmd : !hal.command_buffer>
       target(@exe::@target::@entry)
       workgroups(%buf : !hal.buffer)[%c10]
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @dispatch_indirect_already_using_ordinals
-func.func @dispatch_indirect_already_using_ordinals(
+util.func public @dispatch_indirect_already_using_ordinals(
   // CHECK-SAME: %[[CMD:.+]]: !hal.command_buffer
   %cmd: !hal.command_buffer,
   // CHECK-SAME: %[[EXE:.+]]: !hal.executable
@@ -105,5 +105,5 @@
   hal.command_buffer.dispatch.indirect<%cmd : !hal.command_buffer>
       target(%exe : !hal.executable)[0]
       workgroups(%buf : !hal.buffer)[%c10]
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/verify_target_environment.mlir b/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/verify_target_environment.mlir
index 6d90583..cfa6152 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/verify_target_environment.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/verify_target_environment.mlir

@@ -2,28 +2,28 @@
 
 // expected-error@+1 {{no HAL target devices specified}}
 module @module {
-  func.func private @func() -> ()
+  util.func private @func() -> ()
 }
 
 // -----
 
 // expected-error@+1 {{no HAL target devices specified}}
 module @module attributes {hal.device.targets = []} {
-  func.func private @func() -> ()
+  util.func private @func() -> ()
 }
 
 // -----
 
 // expected-error@+1 {{invalid target attr type}}
 module @module attributes {hal.device.targets = ["wrong_type"]} {
-  func.func private @func() -> ()
+  util.func private @func() -> ()
 }
 
 // -----
 
 // expected-error@+1 {{unregistered target backend "foo"}}
 module @module attributes {hal.device.targets = [#hal.device.target<"foo">]} {
-  func.func private @func() -> ()
+  util.func private @func() -> ()
 }
 
 // -----
@@ -35,7 +35,7 @@
 
 // CHECK: module @module attributes {hal.device.targets = [#device_target_vmvx]}
 module @module attributes {hal.device.targets = [#device_target_vmvx]} {
-  func.func private @func() -> ()
+  util.func private @func() -> ()
 }
 
 // -----

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/call_ops.mlir b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/call_ops.mlir
index 37d37a1..d9a425c 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/call_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/call_ops.mlir

@@ -8,14 +8,14 @@
 
 // CHECK-LABEL: @basicCall
 // CHECK-SAME: (%[[ARG0:.+]]: !stream.resource<*>, %[[SIZE0:.+]]: index, %[[DIM0:.+]]: index)
-func.func @basicCall(%arg0: tensor<?xf32>, %dim0: index) -> tensor<?xf32> {
+util.func public @basicCall(%arg0: tensor<?xf32>, %dim0: index) -> tensor<?xf32> {
   %c0 = arith.constant 0 : index
   // CHECK: %[[RESULT_SIZE:.+]] = stream.tensor.sizeof tensor<?xf32>{%[[DIM0]]}
   // CHECK: %[[CALL:.+]] = stream.async.call @basicExtern
   // CHECK-SAME: (%[[ARG0]][%c0 to %[[SIZE0]] for %[[SIZE0]]], %[[DIM0]]) : (!stream.resource<*>{%[[SIZE0]]}, index) -> !stream.resource<*>{%[[RESULT_SIZE]]}
   %call = flow.call @basicExtern(%arg0, %dim0) : (tensor<?xf32>{%dim0}, index) -> tensor<?xf32>{%dim0}
-  // CHECK: return %[[CALL]], %[[RESULT_SIZE]]
-  return %call : tensor<?xf32>
+  // CHECK: util.return %[[CALL]], %[[RESULT_SIZE]]
+  util.return %call : tensor<?xf32>
 }
 
 // -----
@@ -28,12 +28,12 @@
 
 // CHECK-LABEL: @inplaceCall
 // CHECK-SAME: (%[[ARG0:.+]]: !stream.resource<*>, %[[SIZE0:.+]]: index, %[[DIM0:.+]]: index)
-func.func @inplaceCall(%arg0: tensor<?xf32>, %dim0: index) -> tensor<?xf32> {
+util.func public @inplaceCall(%arg0: tensor<?xf32>, %dim0: index) -> tensor<?xf32> {
   %c0 = arith.constant 0 : index
   // CHECK: %[[CALL:.+]] = stream.async.call @inplaceExtern(%[[ARG0]][%c0 to %[[SIZE0]] for %[[SIZE0]]], %[[DIM0]]) : (!stream.resource<*>{%[[SIZE0]]}, index) -> %[[ARG0]]{%[[SIZE0]]}
   %call = flow.call @inplaceExtern(%arg0, %dim0) : (tensor<?xf32>{%dim0}, index) -> %arg0{%dim0}
-  // CHECK: return %[[CALL]], %[[SIZE0]]
-  return %call : tensor<?xf32>
+  // CHECK: util.return %[[CALL]], %[[SIZE0]]
+  util.return %call : tensor<?xf32>
 }
 
 // -----
@@ -46,10 +46,10 @@
 
 // CHECK-LABEL: @inplaceTypeChangeCall
 // CHECK-SAME: (%[[ARG0:.+]]: !stream.resource<*>, %[[SIZE0:.+]]: index, %[[DIM0:.+]]: index)
-func.func @inplaceTypeChangeCall(%arg0: tensor<?x4xf32>, %dim0: index) -> tensor<4x?xi32> {
+util.func public @inplaceTypeChangeCall(%arg0: tensor<?x4xf32>, %dim0: index) -> tensor<4x?xi32> {
   %c0 = arith.constant 0 : index
   // CHECK: %[[CALL:.+]] = stream.async.call @inplaceTypeChangeExtern(%[[ARG0]][%c0 to %[[SIZE0]] for %[[SIZE0]]], %[[DIM0]]) : (!stream.resource<*>{%[[SIZE0]]}, index) -> %[[ARG0]]{%[[SIZE0]]}
   %call = flow.call @inplaceTypeChangeExtern(%arg0, %dim0) : (tensor<?x4xf32>{%dim0}, index) -> %arg0 as tensor<4x?xi32>{%dim0}
-  // CHECK: return %[[CALL]], %[[SIZE0]]
-  return %call : tensor<4x?xi32>
+  // CHECK: util.return %[[CALL]], %[[SIZE0]]
+  util.return %call : tensor<4x?xi32>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/collective_ops.mlir b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/collective_ops.mlir
index c44f92e..64a2a65 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/collective_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/collective_ops.mlir

@@ -2,88 +2,88 @@
 
 // CHECK-LABEL: @channel_split
 //  CHECK-SAME: (%[[BASE_CHANNEL:.+]]: !stream.channel)
-func.func @channel_split(%base_channel: !flow.channel) -> !flow.channel {
+util.func public @channel_split(%base_channel: !flow.channel) -> !flow.channel {
   // CHECK-DAG: %[[COLOR:.+]] = arith.constant 100 : index
   %color = arith.constant 100 : index
   // CHECK-DAG: %[[KEY:.+]] = arith.constant 101 : index
   %key = arith.constant 101 : index
   // CHECK: %[[SPLIT_CHANNEL:.+]] = stream.channel.split %[[BASE_CHANNEL]], %[[COLOR]], %[[KEY]] : !stream.channel -> !stream.channel
   %split_channel = flow.channel.split %base_channel, %color, %key : !flow.channel -> !flow.channel
-  // CHECK: return %[[SPLIT_CHANNEL]]
-  return %split_channel : !flow.channel
+  // CHECK: util.return %[[SPLIT_CHANNEL]]
+  util.return %split_channel : !flow.channel
 }
 
 // -----
 
 // CHECK-LABEL: @channel_rank
 //  CHECK-SAME: (%[[CHANNEL:.+]]: !stream.channel)
-func.func @channel_rank(%channel: !flow.channel) -> index {
+util.func public @channel_rank(%channel: !flow.channel) -> index {
   // CHECK: %[[RANK:.+]] = stream.channel.rank %[[CHANNEL]] : index
-  // CHECK: return %[[RANK]] : index
+  // CHECK: util.return %[[RANK]] : index
   %rank = flow.channel.rank %channel : index
-  return %rank : index
+  util.return %rank : index
 }
 
 // -----
 
 // CHECK-LABEL: @channel_count
 //  CHECK-SAME: (%[[CHANNEL:.+]]: !stream.channel)
-func.func @channel_count(%channel: !flow.channel) -> index {
+util.func public @channel_count(%channel: !flow.channel) -> index {
   // CHECK: %[[COUNT:.+]] = stream.channel.count %[[CHANNEL]] : index
-  // CHECK: return %[[COUNT]] : index
+  // CHECK: util.return %[[COUNT]] : index
   %count = flow.channel.count %channel : index
-  return %count : index
+  util.return %count : index
 }
 
 // -----
 
 // CHECK-LABEL: @all_reduce_sum
-func.func @all_reduce_sum(%channel: !flow.channel, %arg0: tensor<2304xf32>) -> tensor<2304xf32> {
+util.func public @all_reduce_sum(%channel: !flow.channel, %arg0: tensor<2304xf32>) -> tensor<2304xf32> {
   // CHECK: stream.tensor.empty : tensor<2304xf32>
   // CHECK: stream.async.collective<all_reduce with sum : f32>
   %0 = flow.tensor.empty : tensor<2304xf32>
   %1 = flow.collective.all_reduce sum, f32, %0, %arg0, %channel : (tensor<2304xf32>, tensor<2304xf32>, !flow.channel) -> tensor<2304xf32>
-  return %1 : tensor<2304xf32>
+  util.return %1 : tensor<2304xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @all_gather
-func.func @all_gather(%channel: !flow.channel, %arg0: tensor<512xf32>) -> tensor<1024xf32> {
+util.func public @all_gather(%channel: !flow.channel, %arg0: tensor<512xf32>) -> tensor<1024xf32> {
   // CHECK: stream.tensor.empty : tensor<1024xf32>
   // CHECK: stream.async.collective<all_gather : f32>
   %0 = flow.tensor.empty : tensor<1024xf32>
   %1 = flow.collective.all_gather f32, %0, %arg0, %channel : (tensor<1024xf32>, tensor<512xf32>, !flow.channel) -> tensor<1024xf32>
-  return %1 : tensor<1024xf32>
+  util.return %1 : tensor<1024xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @all_to_all
-func.func @all_to_all(%channel: !flow.channel, %arg0: tensor<1024xf32>) -> tensor<1024xf32> {
+util.func public @all_to_all(%channel: !flow.channel, %arg0: tensor<1024xf32>) -> tensor<1024xf32> {
   // CHECK: stream.tensor.empty : tensor<1024xf32>
   // CHECK: stream.async.collective<all_to_all : f32>
   %0 = flow.tensor.empty : tensor<1024xf32>
   %1 = flow.collective.all_to_all f32, %0, %arg0, %channel : (tensor<1024xf32>, tensor<1024xf32>, !flow.channel) -> tensor<1024xf32>
-  return %1 : tensor<1024xf32>
+  util.return %1 : tensor<1024xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @reduce_scatter
-func.func @reduce_scatter(%channel: !flow.channel, %arg0: tensor<4x2xf32>) -> tensor<2x2xf32> {
+util.func public @reduce_scatter(%channel: !flow.channel, %arg0: tensor<4x2xf32>) -> tensor<2x2xf32> {
   // CHECK: stream.tensor.empty : tensor<2x2xf32>
   // CHECK: stream.async.collective<reduce_scatter with sum : f32>
   %0 = flow.tensor.empty : tensor<2x2xf32>
   %1 = flow.collective.reduce_scatter sum, f32, %0, %arg0, %channel : (tensor<2x2xf32>, tensor<4x2xf32>, !flow.channel) -> tensor<2x2xf32>
-  return %1 : tensor<2x2xf32>
+  util.return %1 : tensor<2x2xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @send_recv
 // CHECK-SAME: index, %[[SEND:.+]]: index, %[[RECV:.+]]: index)
-func.func @send_recv(%channel: !flow.channel, %arg0: tensor<1024xf32>, %send: index, %recv: index) -> tensor<1024xf32> {
+util.func public @send_recv(%channel: !flow.channel, %arg0: tensor<1024xf32>, %send: index, %recv: index) -> tensor<1024xf32> {
   // CHECK: stream.tensor.empty : tensor<1024xf32>
   // CHECK-DAG: %[[CST_LO_MASK:.+]] = arith.constant 65535 : i32
   // CHECK-DAG: %[[CST_SHIFT16:.+]] = arith.constant 16 : i32
@@ -96,5 +96,5 @@
   // CHECK-SAME: source_target_pair(%[[PARAM]])
   %0 = flow.tensor.empty : tensor<1024xf32>
   %1 = flow.collective.send_recv f32, %0, %arg0, %channel, %send, %recv : (tensor<1024xf32>, tensor<1024xf32>, !flow.channel, index, index) -> tensor<1024xf32>
-  return %1 : tensor<1024xf32>
+  util.return %1 : tensor<1024xf32>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/dispatch_ops.mlir b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/dispatch_ops.mlir
index 964a97b..da75704 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/dispatch_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/dispatch_ops.mlir

@@ -2,20 +2,20 @@
 
 // CHECK-LABEL: @dispatchNoWorkload
 //  CHECK-SAME: (%[[INPUT:.+]]: !stream.resource<*>, %[[INPUT_SIZE:.+]]: index, %[[DIM1:.+]]: index, %[[DIM3:.+]]: index)
-func.func @dispatchNoWorkload(%input: tensor<7x?x24x?xf32>, %dim1: index, %dim3: index) -> tensor<?x?x1024xf32> {
+util.func public @dispatchNoWorkload(%input: tensor<7x?x24x?xf32>, %dim1: index, %dim3: index) -> tensor<?x?x1024xf32> {
   //      CHECK: %[[RESULT_SIZE:.+]] = stream.tensor.sizeof tensor<?x?x1024xf32>{%[[DIM1]], %[[DIM3]]}
   //      CHECK: %[[RESULT:.+]] = stream.async.dispatch @ex::@entry(%[[INPUT]][%c0 to %[[INPUT_SIZE]] for %[[INPUT_SIZE]]]) :
   // CHECK-SAME:     (!stream.resource<*>{%[[INPUT_SIZE]]}) -> !stream.resource<*>{%[[RESULT_SIZE]]}
   %0 = flow.dispatch @ex::@entry(%input) : (tensor<7x?x24x?xf32>{%dim1, %dim3}) -> tensor<?x?x1024xf32>{%dim1, %dim3}
   // return %[[RESULT]], %[[RESULT_SIZE]] : !stream.resource<*>, index
-  return %0 : tensor<?x?x1024xf32>
+  util.return %0 : tensor<?x?x1024xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @dispatch
 //  CHECK-SAME: (%[[INPUT:.+]]: !stream.resource<*>, %[[INPUT_SIZE:.+]]: index, %[[DIM1:.+]]: index, %[[DIM3:.+]]: index)
-func.func @dispatch(%input: tensor<7x?x24x?xf32>, %dim1: index, %dim3: index) -> tensor<?x?x1024xf32> {
+util.func public @dispatch(%input: tensor<7x?x24x?xf32>, %dim1: index, %dim3: index) -> tensor<?x?x1024xf32> {
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
   %c3 = arith.constant 3 : index
@@ -24,14 +24,14 @@
   // CHECK-SAME:     (!stream.resource<*>{%[[INPUT_SIZE]]}) -> !stream.resource<*>{%[[RESULT_SIZE]]}
   %0 = flow.dispatch @ex::@entry[%c1, %c2, %c3](%input) : (tensor<7x?x24x?xf32>{%dim1, %dim3}) -> tensor<?x?x1024xf32>{%dim1, %dim3}
   // return %[[RESULT]], %[[RESULT_SIZE]] : !stream.resource<*>, index
-  return %0 : tensor<?x?x1024xf32>
+  util.return %0 : tensor<?x?x1024xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @tiedDispatch
 //  CHECK-SAME: (%[[INPUT0:.+]]: !stream.resource<*>, %[[INPUT0_SIZE:.+]]: index, %[[INPUT1:.+]]: !stream.resource<*>, %[[INPUT1_SIZE:.+]]: index)
-func.func @tiedDispatch(%input0: tensor<i32>, %input1: tensor<2x3xi32>) -> tensor<3x9xi32> {
+util.func public @tiedDispatch(%input0: tensor<i32>, %input1: tensor<2x3xi32>) -> tensor<3x9xi32> {
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
   %c3 = arith.constant 3 : index
@@ -40,15 +40,15 @@
   %0 = flow.dispatch @ex::@entry0[%c1, %c2, %c3](%input0) : (tensor<i32>) -> tensor<3x9xi32>
   // CHECK: %[[RESULT:.+]] = stream.async.dispatch @ex::@entry1[%c1, %c2, %c3](%[[INPUT1]][%c0 to %[[INPUT1_SIZE]] for %[[INPUT1_SIZE]]], %[[T]][%c0 to %[[T_SIZE]] for %[[T_SIZE]]]) : (!stream.resource<*>{%[[INPUT1_SIZE]]}, !stream.resource<*>{%[[T_SIZE]]}) -> %[[T]]{%[[T_SIZE]]}
   %1 = flow.dispatch @ex::@entry1[%c1, %c2, %c3](%input1, %0) : (tensor<2x3xi32>, tensor<3x9xi32>) -> %0
-  // CHECK: return %[[RESULT]], %[[T_SIZE]] : !stream.resource<*>, index
-  return %1 : tensor<3x9xi32>
+  // CHECK: util.return %[[RESULT]], %[[T_SIZE]] : !stream.resource<*>, index
+  util.return %1 : tensor<3x9xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @dispatchAffinity
 //  CHECK-SAME: (%[[INPUT:.+]]: !stream.resource<*>, %[[INPUT_SIZE:.+]]: index, %[[DIM1:.+]]: index, %[[DIM3:.+]]: index)
-func.func @dispatchAffinity(%input: tensor<7x?x24x?xf32>, %dim1: index, %dim3: index) -> (tensor<?x?x1024xf32>, tensor<?x?x1024xf32>) {
+util.func public @dispatchAffinity(%input: tensor<7x?x24x?xf32>, %dim1: index, %dim3: index) -> (tensor<?x?x1024xf32>, tensor<?x?x1024xf32>) {
   //      CHECK: %[[RESULT0_SIZE:.+]] = stream.tensor.sizeof on(#hal.affinity.queue<[0]>) tensor<?x?x1024xf32>{%[[DIM1]], %[[DIM3]]}
   //      CHECK: %[[RESULT0:.+]] = stream.async.dispatch on(#hal.affinity.queue<[0]>) @ex::@entry0(%[[INPUT]][%c0 to %[[INPUT_SIZE]] for %[[INPUT_SIZE]]])
   %0 = flow.dispatch @ex::@entry0(%input) {
@@ -60,5 +60,5 @@
     stream.affinity = #hal.affinity.queue<[1]>
   } : (tensor<7x?x24x?xf32>{%dim1, %dim3}) -> tensor<?x?x1024xf32>{%dim3, %dim1}
   // return %[[RESULT0]], %[[RESULT0_SIZE]], %[[RESULT1]], %[[RESULT1_SIZE]]
-  return %0, %1 : tensor<?x?x1024xf32>, tensor<?x?x1024xf32>
+  util.return %0, %1 : tensor<?x?x1024xf32>, tensor<?x?x1024xf32>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/executable_ops.mlir b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/executable_ops.mlir
index d221ddf..d45d1f5 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/executable_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/executable_ops.mlir

@@ -19,9 +19,9 @@
         flow.return %arg0, %arg0, %arg0 : index, index, index
       }
   builtin.module {
-    // CHECK: func.func @dispatch()
-    func.func @dispatch() {
-      return
+    // CHECK: util.func public @dispatch()
+    util.func public @dispatch() {
+      util.return
     }
   }
 }
@@ -32,13 +32,13 @@
 flow.executable private @rank_0_binding {
   flow.executable.export public @dispatch
   builtin.module {
-    // CHECK: func.func @dispatch(%[[INPUT:.+]]: !stream.binding)
-    func.func @dispatch(%input: !flow.dispatch.tensor<readonly:tensor<i64>>) {
+    // CHECK: util.func public @dispatch(%[[INPUT:.+]]: !stream.binding)
+    util.func public @dispatch(%input: !flow.dispatch.tensor<readonly:tensor<i64>>) {
       // CHECK: %[[SUBSPAN:.+]] = stream.binding.subspan %[[INPUT]][%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<i64>>
       // CHECK: = flow.dispatch.tensor.load %[[SUBSPAN]]
       %tied_input = flow.dispatch.tensor.load %input, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<i64>> -> tensor<i64>
       util.optimization_barrier %tied_input : tensor<i64>
-      return
+      util.return
     }
   }
 }
@@ -49,8 +49,8 @@
 flow.executable private @static_bindings {
   flow.executable.export public @dispatch
   builtin.module {
-    // CHECK: func.func @dispatch(%[[INPUT:.+]]: !stream.binding, %[[OUTPUT:.+]]: !stream.binding)
-    func.func @dispatch(%input: !flow.dispatch.tensor<readonly:tensor<1x4xf32>>, %output: !flow.dispatch.tensor<writeonly:tensor<4xf32>>) {
+    // CHECK: util.func public @dispatch(%[[INPUT:.+]]: !stream.binding, %[[OUTPUT:.+]]: !stream.binding)
+    util.func public @dispatch(%input: !flow.dispatch.tensor<readonly:tensor<1x4xf32>>, %output: !flow.dispatch.tensor<writeonly:tensor<4xf32>>) {
       // CHECK-DAG: %[[TIED_INPUT:.+]] = stream.binding.subspan %[[INPUT]][%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1x4xf32>>
       // CHECK-DAG: %[[TIED_OUTPUT:.+]] = stream.binding.subspan %[[OUTPUT]][%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<4xf32>>
       %tied_input = flow.dispatch.tie_shape %input : !flow.dispatch.tensor<readonly:tensor<1x4xf32>>
@@ -60,7 +60,7 @@
       // CHECK: flow.dispatch.tensor.store %[[TILE]], %[[TIED_OUTPUT]]
       %tile = flow.dispatch.tensor.load %tied_input, offsets = [0, 0], sizes = [1, 4], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4xf32>> -> tensor<4xf32>
       flow.dispatch.tensor.store %tile, %tied_output, offsets = [0], sizes = [4], strides = [1] : tensor<4xf32> -> !flow.dispatch.tensor<writeonly:tensor<4xf32>>
-      return
+      util.return
     }
   }
 }
@@ -71,8 +71,8 @@
 flow.executable private @dynamic_bindings {
   flow.executable.export public @dispatch
   builtin.module {
-    // CHECK: func.func @dispatch(%[[DIM:.+]]: index, %[[INPUT:.+]]: !stream.binding, %[[OUTPUT:.+]]: !stream.binding)
-    func.func @dispatch(%dim: index, %input: !flow.dispatch.tensor<readonly:tensor<1x?xf32>>, %output: !flow.dispatch.tensor<writeonly:tensor<?xf32>>) {
+    // CHECK: util.func public @dispatch(%[[DIM:.+]]: index, %[[INPUT:.+]]: !stream.binding, %[[OUTPUT:.+]]: !stream.binding)
+    util.func public @dispatch(%dim: index, %input: !flow.dispatch.tensor<readonly:tensor<1x?xf32>>, %output: !flow.dispatch.tensor<writeonly:tensor<?xf32>>) {
       // CHECK-DAG: %[[TIED_INPUT:.+]] = stream.binding.subspan %[[INPUT]][%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1x?xf32>>{%[[DIM]]}
       // CHECK-DAG: %[[TIED_OUTPUT:.+]] = stream.binding.subspan %[[OUTPUT]][%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%[[DIM]]}
       %tied_input = flow.dispatch.tie_shape %input : !flow.dispatch.tensor<readonly:tensor<1x?xf32>>{%dim}
@@ -82,7 +82,7 @@
       // CHECK: flow.dispatch.tensor.store %[[TILE]], %[[TIED_OUTPUT]]
       %tile = flow.dispatch.tensor.load %tied_input, offsets = [0, 0], sizes = [1, %dim], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x?xf32>>{%dim} -> tensor<?xf32>
       flow.dispatch.tensor.store %tile, %tied_output, offsets = [0], sizes = [%dim], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%dim}
-      return
+      util.return
     }
   }
 }
@@ -93,8 +93,8 @@
 flow.executable private @indirect_dynamic_bindings {
   flow.executable.export public @dispatch
   builtin.module {
-    // CHECK: func.func @dispatch(%[[DIM_TENSOR:.+]]: !stream.binding, %[[INPUT:.+]]: !stream.binding, %[[OUTPUT:.+]]: !stream.binding)
-    func.func @dispatch(%dim_tensor: !flow.dispatch.tensor<readonly:tensor<i64>>, %input: !flow.dispatch.tensor<readonly:tensor<1x?xf32>>, %output: !flow.dispatch.tensor<writeonly:tensor<?xf32>>) {
+    // CHECK: util.func public @dispatch(%[[DIM_TENSOR:.+]]: !stream.binding, %[[INPUT:.+]]: !stream.binding, %[[OUTPUT:.+]]: !stream.binding)
+    util.func public @dispatch(%dim_tensor: !flow.dispatch.tensor<readonly:tensor<i64>>, %input: !flow.dispatch.tensor<readonly:tensor<1x?xf32>>, %output: !flow.dispatch.tensor<writeonly:tensor<?xf32>>) {
       // CHECK: %[[DIM_SUBSPAN:.+]] = stream.binding.subspan %[[DIM_TENSOR]][%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<i64>>
       // CHECK: %[[DIM_TILE:.+]] = flow.dispatch.tensor.load %[[DIM_SUBSPAN]]
       // CHECK: %[[DIM_I64:.+]] = tensor.extract %[[DIM_TILE]][] : tensor<i64>
@@ -112,7 +112,7 @@
       // CHECK: flow.dispatch.tensor.store %[[TILE]], %[[TIED_OUTPUT]]
       %tile = flow.dispatch.tensor.load %tied_input, offsets = [0, 0], sizes = [1, %dim], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x?xf32>>{%dim} -> tensor<?xf32>
       flow.dispatch.tensor.store %tile, %tied_output, offsets = [0], sizes = [%dim], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%dim}
-      return
+      util.return
     }
   }
 }
@@ -123,8 +123,8 @@
 flow.executable private @nested_bindings {
   flow.executable.export public @dispatch
   builtin.module {
-    // CHECK: func.func @dispatch(%[[DIM:.+]]: index, %[[INPUT:.+]]: !stream.binding, %[[OUTPUT:.+]]: !stream.binding)
-    func.func @dispatch(%dim: index, %input: !flow.dispatch.tensor<readonly:tensor<1x?xf32>>, %output: !flow.dispatch.tensor<writeonly:tensor<?xf32>>) {
+    // CHECK: util.func public @dispatch(%[[DIM:.+]]: index, %[[INPUT:.+]]: !stream.binding, %[[OUTPUT:.+]]: !stream.binding)
+    util.func public @dispatch(%dim: index, %input: !flow.dispatch.tensor<readonly:tensor<1x?xf32>>, %output: !flow.dispatch.tensor<writeonly:tensor<?xf32>>) {
       // CHECK-DAG: stream.dispatch.workgroup.size[0] : index
       %workgroup_size_0 = flow.dispatch.workgroup.size[0] : index
       // CHECK-DAG: stream.dispatch.workgroup.id[0] : index
@@ -146,7 +146,7 @@
         %tile = flow.dispatch.tensor.load %tied_input, offsets = [0, %arg3], sizes = [1, %7], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x?xf32>>{%dim} -> tensor<?xf32>
         flow.dispatch.tensor.store %tile, %tied_output, offsets = [%arg3], sizes = [%7], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%dim}
       }
-      return
+      util.return
     }
   }
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/tensor_ops.mlir b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/tensor_ops.mlir
index 2bbd064..7de878f 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/tensor_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/tensor_ops.mlir

@@ -2,33 +2,33 @@
 
 // CHECK-LABEL: @tensorReshapePassThrough
 //  CHECK-SAME: (%[[INPUT:.+]]: !stream.resource<*>, %[[INPUT_SIZE:.+]]: index)
-func.func @tensorReshapePassThrough(%input: tensor<5x24x48xf32>) -> tensor<30x2x96xf32> {
+util.func public @tensorReshapePassThrough(%input: tensor<5x24x48xf32>) -> tensor<30x2x96xf32> {
   // CHECK: %[[RESULT_SIZE:.+]] = stream.tensor.sizeof tensor<30x2x96xf32> : index
   // CHECK: %[[RESULT:.+]] = stream.tensor.clone %[[INPUT]] : tensor<5x24x48xf32> in !stream.resource<*>{%[[INPUT_SIZE]]} -> tensor<30x2x96xf32> in !stream.resource<*>{%[[RESULT_SIZE]]}
   %0 = flow.tensor.reshape %input : tensor<5x24x48xf32> -> tensor<30x2x96xf32>
-  // CHECK: return %[[RESULT]], %[[RESULT_SIZE]] : !stream.resource<*>, index
-  return %0 : tensor<30x2x96xf32>
+  // CHECK: util.return %[[RESULT]], %[[RESULT_SIZE]] : !stream.resource<*>, index
+  util.return %0 : tensor<30x2x96xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorReshapeWithSingleUse
 //  CHECK-SAME: (%[[INPUT:.+]]: !stream.resource<*>, %[[INPUT_SIZE:.+]]: index)
-func.func @tensorReshapeWithSingleUse(%input: tensor<5x24x48xf32>) -> tensor<30x2x96xf32> {
+util.func public @tensorReshapeWithSingleUse(%input: tensor<5x24x48xf32>) -> tensor<30x2x96xf32> {
   // CHECK: %[[RESULT_SIZE:.+]] = stream.tensor.sizeof tensor<30x2x96xf32> : index
   // CHECK: %[[RESHAPE:.+]] = stream.tensor.clone %[[INPUT]] : tensor<5x24x48xf32> in !stream.resource<*>{%[[INPUT_SIZE]]} -> tensor<30x2x96xf32> in !stream.resource<*>{%[[RESULT_SIZE]]}
   %0 = flow.tensor.reshape %input : tensor<5x24x48xf32> -> tensor<30x2x96xf32>
   // CHECK: %[[RESULT:.+]] = stream.tensor.clone %[[RESHAPE]] : tensor<30x2x96xf32> in !stream.resource<*>{%[[RESULT_SIZE]]} -> tensor<30x2x96xf32> in !stream.resource<*>{%[[RESULT_SIZE]]}
   %1 = flow.tensor.clone %0 : tensor<30x2x96xf32>
-  // CHECK: return %[[RESULT]], %[[RESULT_SIZE]] : !stream.resource<*>, index
-  return %1 : tensor<30x2x96xf32>
+  // CHECK: util.return %[[RESULT]], %[[RESULT_SIZE]] : !stream.resource<*>, index
+  util.return %1 : tensor<30x2x96xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorReshapeWithMultipleUses
 //  CHECK-SAME: (%[[INPUT:.+]]: !stream.resource<*>, %[[INPUT_SIZE:.+]]: index)
-func.func @tensorReshapeWithMultipleUses(%input: tensor<5x24x48xf32>)
+util.func public @tensorReshapeWithMultipleUses(%input: tensor<5x24x48xf32>)
     -> (tensor<60x2x48xf32>, tensor<30x2x96xf32>) {
   // CHECK: %[[T0:.+]] = stream.tensor.clone %[[INPUT]] : tensor<5x24x48xf32> in !stream.resource<*>{%[[INPUT_SIZE]]} -> tensor<5x24x48xf32> in !stream.resource<*>{%[[INPUT_SIZE]]}
   %1 = flow.tensor.clone %input : tensor<5x24x48xf32>
@@ -40,65 +40,65 @@
   // CHECK: %[[T3_SIZE:.+]] = stream.tensor.sizeof tensor<30x2x96xf32> : index
   // CHECK: %[[T3:.+]] = stream.tensor.clone %[[T0]] : tensor<5x24x48xf32> in !stream.resource<*>{%[[INPUT_SIZE]]} -> tensor<30x2x96xf32> in !stream.resource<*>{%[[T3_SIZE]]}
   %4 = flow.tensor.reshape %1 : tensor<5x24x48xf32> -> tensor<30x2x96xf32>
-  // CHECK: return %[[T2]], %[[T1_SIZE]], %[[T3]], %[[T3_SIZE]] : !stream.resource<*>, index, !stream.resource<*>, index
-  return %3, %4 : tensor<60x2x48xf32>, tensor<30x2x96xf32>
+  // CHECK: util.return %[[T2]], %[[T1_SIZE]], %[[T3]], %[[T3_SIZE]] : !stream.resource<*>, index, !stream.resource<*>, index
+  util.return %3, %4 : tensor<60x2x48xf32>, tensor<30x2x96xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorBitCastWithSingleUse
 //  CHECK-SAME: (%[[INPUT:.+]]: !stream.resource<*>, %[[INPUT_SIZE:.+]]: index)
-func.func @tensorBitCastWithSingleUse(%input: tensor<5x24x48xi8>) -> tensor<30x2x192xi4> {
+util.func public @tensorBitCastWithSingleUse(%input: tensor<5x24x48xi8>) -> tensor<30x2x192xi4> {
   // CHECK: %[[RESULT_SIZE:.+]] = stream.tensor.sizeof tensor<30x2x192xi4> : index
   // CHECK: %[[BITCAST:.+]] = stream.tensor.clone %[[INPUT]] : tensor<5x24x48xi8> in !stream.resource<*>{%[[INPUT_SIZE]]} -> tensor<30x2x192xi4> in !stream.resource<*>{%[[RESULT_SIZE]]}
   %0 = flow.tensor.bitcast %input : tensor<5x24x48xi8> -> tensor<30x2x192xi4>
   // CHECK: %[[RESULT:.+]] = stream.tensor.clone %[[BITCAST]] : tensor<30x2x192xi4> in !stream.resource<*>{%[[RESULT_SIZE]]} -> tensor<30x2x192xi4> in !stream.resource<*>{%[[RESULT_SIZE]]}
   %1 = flow.tensor.clone %0 : tensor<30x2x192xi4>
-  // CHECK: return %[[RESULT]], %[[RESULT_SIZE]] : !stream.resource<*>, index
-  return %1 : tensor<30x2x192xi4>
+  // CHECK: util.return %[[RESULT]], %[[RESULT_SIZE]] : !stream.resource<*>, index
+  util.return %1 : tensor<30x2x192xi4>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorAlloca
 //  CHECK-SAME: (%[[DIM0:.+]]: index)
-func.func @tensorAlloca(%dim0: index) -> tensor<?x0xf32> {
+util.func public @tensorAlloca(%dim0: index) -> tensor<?x0xf32> {
   // CHECK: %[[ALLOCA_SIZE:.+]] = stream.tensor.sizeof tensor<?x0xf32>{%[[DIM0]]}
   // CHECK: %[[ALLOCA:.+]] = stream.async.alloca : !stream.resource<*>{%[[ALLOCA_SIZE]]}
   %0 = flow.tensor.alloca : tensor<?x0xf32>{%dim0}
-  // CHECK: return %[[ALLOCA]]
-  return %0 : tensor<?x0xf32>
+  // CHECK: util.return %[[ALLOCA]]
+  util.return %0 : tensor<?x0xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorEmpty
 //  CHECK-SAME: (%[[DIM0:.+]]: index)
-func.func @tensorEmpty(%dim0: index) -> tensor<?x0xf32> {
+util.func public @tensorEmpty(%dim0: index) -> tensor<?x0xf32> {
   // CHECK: %[[EMPTY_SIZE:.+]] = stream.tensor.sizeof tensor<?x0xf32>{%[[DIM0]]}
   // CHECK: %[[EMPTY:.+]] = stream.tensor.empty : tensor<?x0xf32>{%[[DIM0]]} in !stream.resource<*>{%[[EMPTY_SIZE]]}
   %0 = flow.tensor.empty : tensor<?x0xf32>{%dim0}
-  // CHECK: return %[[EMPTY]]
-  return %0 : tensor<?x0xf32>
+  // CHECK: util.return %[[EMPTY]]
+  util.return %0 : tensor<?x0xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorSplat
 //  CHECK-SAME: (%[[VALUE:.+]]: i8, %[[DIM0:.+]]: index)
-func.func @tensorSplat(%value: i8, %dim0: index) -> tensor<?x128xi8> {
+util.func public @tensorSplat(%value: i8, %dim0: index) -> tensor<?x128xi8> {
   // CHECK: %[[T_SIZE:.+]] = stream.tensor.sizeof tensor<?x128xi8>{%[[DIM0]]} : index
   // CHECK: %[[T:.+]] = stream.tensor.splat %[[VALUE]] : i8 -> tensor<?x128xi8>{%[[DIM0]]} in !stream.resource<*>{%[[T_SIZE]]}
   %0 = flow.tensor.splat %value : tensor<?x128xi8>{%dim0}
-  // CHECK: return %[[T]], %[[T_SIZE]]
-  return %0 : tensor<?x128xi8>
+  // CHECK: util.return %[[T]], %[[T_SIZE]]
+  util.return %0 : tensor<?x128xi8>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorSlice
 //  CHECK-SAME: (%[[INPUT:.+]]: !stream.resource<*>, %[[INPUT_SIZE:.+]]: index)
-func.func @tensorSlice(%input : tensor<5x24x48xf32>) -> tensor<3x24x48xf32> {
+util.func public @tensorSlice(%input : tensor<5x24x48xf32>) -> tensor<3x24x48xf32> {
   %c0 = arith.constant 0 : index
   %c2 = arith.constant 2 : index
   %c3 = arith.constant 3 : index
@@ -107,28 +107,28 @@
   // CHECK: %[[T_SIZE:.+]] = stream.tensor.sizeof tensor<3x24x48xf32> : index
   // CHECK: %[[T:.+]] = stream.tensor.slice %[[INPUT]][%c2, %c0, %c0 for %c3, %c24, %c48] : tensor<5x24x48xf32> in !stream.resource<*>{%[[INPUT_SIZE]]} -> tensor<3x24x48xf32> in !stream.resource<*>{%[[T_SIZE]]}
   %0 = flow.tensor.slice %input[%c2, %c0, %c0 for %c3, %c24, %c48] : tensor<5x24x48xf32> -> tensor<3x24x48xf32>
-  // CHECK: return %[[T]], %[[T_SIZE]] : !stream.resource<*>, index
-  return %0 : tensor<3x24x48xf32>
+  // CHECK: util.return %[[T]], %[[T_SIZE]] : !stream.resource<*>, index
+  util.return %0 : tensor<3x24x48xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorUpdate
 //  CHECK-SAME: (%[[UPDATE:.+]]: !stream.resource<*>, %[[UPDATE_SIZE:.+]]: index, %[[TARGET:.+]]: !stream.resource<*>, %[[TARGET_SIZE:.+]]: index)
-func.func @tensorUpdate(%update : tensor<1x1x10xf32>, %target : tensor<5x1x10xf32>) -> tensor<5x1x10xf32> {
+util.func public @tensorUpdate(%update : tensor<1x1x10xf32>, %target : tensor<5x1x10xf32>) -> tensor<5x1x10xf32> {
   %c1 = arith.constant 1 : index
   %c4 = arith.constant 4 : index
   // CHECK: %[[T:.+]] = stream.tensor.update %[[UPDATE]], %[[TARGET]][%c4, %c1, %c1] : tensor<1x1x10xf32> in !stream.resource<*>{%[[UPDATE_SIZE]]} -> tensor<5x1x10xf32> in %[[TARGET]] as !stream.resource<*>{%[[TARGET_SIZE]]}
   %0 = flow.tensor.update %update, %target[%c4, %c1, %c1] : tensor<1x1x10xf32> -> %target as tensor<5x1x10xf32>
-  // CHECK: return %[[T]], %[[TARGET_SIZE]] : !stream.resource<*>, index
-  return %0 : tensor<5x1x10xf32>
+  // CHECK: util.return %[[T]], %[[TARGET_SIZE]] : !stream.resource<*>, index
+  util.return %0 : tensor<5x1x10xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorLoad
 //  CHECK-SAME: (%[[SOURCE:.+]]: !stream.resource<*>, %[[SOURCE_SIZE:.+]]: index)
-func.func @tensorLoad(%source : tensor<2x3xi32>) -> i32 {
+util.func public @tensorLoad(%source : tensor<2x3xi32>) -> i32 {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   // CHECK: %[[T0:.+]] = stream.async.transfer from(#hal.affinity.queue<[0, 1]>) %[[SOURCE]] :
@@ -137,15 +137,15 @@
   %0 = flow.tensor.load %source[%c0, %c1] : tensor<2x3xi32> attributes {
     stream.affinity = #hal.affinity.queue<[0, 1]>
   }
-  // CHECK: return %[[T1]]
-  return %0 : i32
+  // CHECK: util.return %[[T1]]
+  util.return %0 : i32
 }
 
 // -----
 
 // CHECK-LABEL: @tensorStore
 //  CHECK-SAME: (%[[TARGET:.+]]: !stream.resource<*>, %[[TARGET_SIZE:.+]]: index)
-func.func @tensorStore(%target : tensor<2x3xi32>) -> tensor<2x3xi32> {
+util.func public @tensorStore(%target : tensor<2x3xi32>) -> tensor<2x3xi32> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c9 = arith.constant 9 : i32
@@ -158,15 +158,15 @@
   %0 = flow.tensor.store %c9, %target[%c0, %c1] : tensor<2x3xi32> attributes {
     stream.affinity = #hal.affinity.queue<[0, 1]>
   }
-  // CHECK: return %[[T2]]
-  return %0 : tensor<2x3xi32>
+  // CHECK: util.return %[[T2]]
+  util.return %0 : tensor<2x3xi32>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorTrace
 //  CHECK-SAME: (%[[TENSOR0:.+]]: !stream.resource<*>, %[[TENSOR0_SIZE:.+]]: index, %[[TENSOR1:.+]]: !stream.resource<*>, %[[TENSOR1_SIZE:.+]]: index, %[[TENSOR1_DIM0:.+]]: index, %[[TENSOR1_DIM2:.+]]: index)
-func.func @tensorTrace(%tensor0: tensor<5xf32>, %tensor1: tensor<?x3x?xi32>, %tensor1_dim0: index, %tensor1_dim2: index) {
+util.func public @tensorTrace(%tensor0: tensor<5xf32>, %tensor1: tensor<?x3x?xi32>, %tensor1_dim0: index, %tensor1_dim2: index) {
   // CHECK-DAG: %[[TENSOR0_STAGED:.+]] = stream.async.transfer %[[TENSOR0]] : !stream.resource<*>{%[[TENSOR0_SIZE]]} -> !stream.resource<staging>{%[[TENSOR0_SIZE]]}
   // CHECK-DAG: %[[TENSOR1_STAGED:.+]] = stream.async.transfer %[[TENSOR1]] : !stream.resource<*>{%[[TENSOR1_SIZE]]} -> !stream.resource<staging>{%[[TENSOR1_SIZE]]}
   //      CHECK: stream.tensor.trace "FOOBAR" = [
@@ -177,5 +177,5 @@
     %tensor0 : tensor<5xf32>,
     %tensor1 : tensor<?x3x?xi32>{%tensor1_dim0, %tensor1_dim2}
   ]
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/HALToStream/test/abi_ops.mlir b/compiler/src/iree/compiler/Dialect/Stream/Conversion/HALToStream/test/abi_ops.mlir
index 1ffde57..8dc6705 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/HALToStream/test/abi_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/HALToStream/test/abi_ops.mlir

@@ -3,7 +3,7 @@
 // CHECK-LABEL: @importBufferView
 // CHECK-SAME: (%[[VIEW:.+]]: !hal.buffer_view)
 // CHECK-SAME: -> (!stream.resource<*>, index)
-func.func @importBufferView(%view: !hal.buffer_view) -> tensor<?x?x4xf32> {
+util.func public @importBufferView(%view: !hal.buffer_view) -> tensor<?x?x4xf32> {
   //  CHECK-DAG: %[[DIM0:.+]] = hal.buffer_view.dim{{.+}}[0]
   %dim0 = hal.buffer_view.dim<%view : !hal.buffer_view>[0] : index
   //  CHECK-DAG: %[[DIM1:.+]] = hal.buffer_view.dim{{.+}}[1]
@@ -14,23 +14,23 @@
   // CHECK-NEXT: %[[RESULT:.+]] = stream.async.transfer %[[RESOURCE]] :
   // CHECK-SAME:     !stream.resource<external>{%[[SIZE]]} -> !stream.resource<*>{%[[SIZE]]}
   %0 = hal.tensor.import %view : !hal.buffer_view -> tensor<?x?x4xf32>{%dim0, %dim1}
-  // CHECK: return %[[RESULT]], %[[SIZE]] : !stream.resource<*>, index
-  return %0 : tensor<?x?x4xf32>
+  // CHECK: util.return %[[RESULT]], %[[SIZE]] : !stream.resource<*>, index
+  util.return %0 : tensor<?x?x4xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @importBufferViewBitcasting
 // CHECK-SAME: (%[[VIEW:.+]]: !hal.buffer_view) -> (!stream.resource<*>, index)
-func.func @importBufferViewBitcasting(%view: !hal.buffer_view) -> tensor<4xbf16> {
+util.func public @importBufferViewBitcasting(%view: !hal.buffer_view) -> tensor<4xbf16> {
   //  CHECK-DAG: %[[SIZE:.+]] = stream.tensor.sizeof tensor<4xbf16>
   //      CHECK: %[[RESOURCE:.+]] = stream.tensor.import %[[VIEW]] : !hal.buffer_view ->
   // CHECK-SAME:     tensor<2xui32> in !stream.resource<external>{%[[SIZE]]}
   // CHECK-NEXT: %[[RESULT:.+]] = stream.async.transfer %[[RESOURCE]] :
   // CHECK-SAME:     !stream.resource<external>{%[[SIZE]]} -> !stream.resource<*>{%[[SIZE]]}
   %0 = hal.tensor.import %view : !hal.buffer_view -> tensor<2xui32> as tensor<4xbf16>
-  // CHECK: return %[[RESULT]], %[[SIZE]] : !stream.resource<*>, index
-  return %0 : tensor<4xbf16>
+  // CHECK: util.return %[[RESULT]], %[[SIZE]] : !stream.resource<*>, index
+  util.return %0 : tensor<4xbf16>
 }
 
 // -----
@@ -38,7 +38,7 @@
 // CHECK-LABEL: @importBufferViewAsync
 // CHECK-SAME: (%[[VIEW:.+]]: !hal.buffer_view, %[[FENCE:.+]]: !hal.fence)
 // CHECK-SAME: -> (!stream.resource<*>, index)
-func.func @importBufferViewAsync(%view: !hal.buffer_view, %fence: !hal.fence) -> tensor<4xf32> {
+util.func public @importBufferViewAsync(%view: !hal.buffer_view, %fence: !hal.fence) -> tensor<4xf32> {
   //  CHECK-DAG: %[[SIZE:.+]] = stream.tensor.sizeof tensor<4xf32>
   //      CHECK: %[[ASYNC_RESOURCE:.+]] = stream.tensor.import %[[VIEW]]
   // CHECK-SAME:     : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%[[SIZE]]}
@@ -48,30 +48,30 @@
   // CHECK-NEXT: %[[RESULT:.+]] = stream.async.transfer %[[SYNC_RESOURCE]]
   // CHECK-SAME:     : !stream.resource<external>{%[[SIZE]]} -> !stream.resource<*>{%[[SIZE]]}
   %0 = hal.tensor.import wait(%fence) => %view : !hal.buffer_view -> tensor<4xf32>
-  // CHECK: return %[[RESULT]], %[[SIZE]] : !stream.resource<*>, index
-  return %0 : tensor<4xf32>
+  // CHECK: util.return %[[RESULT]], %[[SIZE]] : !stream.resource<*>, index
+  util.return %0 : tensor<4xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @exportBufferView
 // CHECK-SAME: (%[[TENSOR:.+]]: !stream.resource<*>, %[[SIZE:.+]]: index, %[[DIM0:.+]]: index, %[[DIM1:.+]]: index)
-func.func @exportBufferView(%tensor: tensor<?x?x4xf32>, %dim0: index, %dim1: index) -> !hal.buffer_view {
+util.func public @exportBufferView(%tensor: tensor<?x?x4xf32>, %dim0: index, %dim1: index) -> !hal.buffer_view {
   //      CHECK: %[[VIEW:.+]] = stream.async.transfer %[[TENSOR]] :
   // CHECK-SAME:     !stream.resource<*>{%[[SIZE]]} -> !stream.resource<external>{%[[SIZE]]}
   // CHECK-NEXT: %[[RESULT:.+]] = stream.tensor.export %[[VIEW]] :
   // CHECK-SAME:     tensor<?x?x4xf32>{%[[DIM0]], %[[DIM1]]} in !stream.resource<external>{%[[SIZE]]}
   // CHECK-SAME:     -> !hal.buffer_view
   %0 = hal.tensor.export %tensor : tensor<?x?x4xf32>{%dim0, %dim1} -> !hal.buffer_view
-  // CHECK: return %[[RESULT]]
-  return %0 : !hal.buffer_view
+  // CHECK: util.return %[[RESULT]]
+  util.return %0 : !hal.buffer_view
 }
 
 // -----
 
 // CHECK-LABEL: @exportBufferViewInPlace
 // CHECK-SAME: (%[[TENSOR:.+]]: !stream.resource<*>, %[[SIZE:.+]]: index, %[[DIM0:.+]]: index, %[[DIM1:.+]]: index, %[[STORAGE:.+]]: !hal.buffer)
-func.func @exportBufferViewInPlace(%tensor: tensor<?x?x4xf32>, %dim0: index, %dim1: index, %storage: !hal.buffer) -> !hal.buffer_view {
+util.func public @exportBufferViewInPlace(%tensor: tensor<?x?x4xf32>, %dim0: index, %dim1: index, %storage: !hal.buffer) -> !hal.buffer_view {
   //      CHECK: %[[STORAGE_SIZE:.+]] = stream.tensor.sizeof tensor<?x?x4xf32>{%[[DIM0]], %[[DIM1]]} : index
   // CHECK-NEXT: %[[STORAGE_IMPORT:.+]] = stream.tensor.import %[[STORAGE]]
   // CHECK-SAME:   : !hal.buffer -> tensor<?x?x4xf32>{%[[DIM0]], %[[DIM1]]} in !stream.resource<external>{%[[STORAGE_SIZE]]}
@@ -81,8 +81,8 @@
   // CHECK-SAME:     tensor<?x?x4xf32>{%[[DIM0]], %[[DIM1]]} in !stream.resource<external>{%[[STORAGE_SIZE]]}
   // CHECK-SAME:     -> !hal.buffer_view
   %0 = hal.tensor.export %tensor into(%storage : !hal.buffer) : tensor<?x?x4xf32>{%dim0, %dim1} -> !hal.buffer_view
-  // CHECK: return %[[STORAGE_RESULT]]
-  return %0 : !hal.buffer_view
+  // CHECK: util.return %[[STORAGE_RESULT]]
+  util.return %0 : !hal.buffer_view
 }
 
 // -----
@@ -91,10 +91,10 @@
 
 // CHECK-LABEL: @exportBufferViewInPlaceToView
 // CHECK-SAME: (%[[TENSOR:.+]]: !stream.resource<*>, %[[SIZE:.+]]: index, %[[DIM0:.+]]: index, %[[DIM1:.+]]: index, %[[STORAGE:.+]]: !hal.buffer_view)
-func.func @exportBufferViewInPlaceToView(%tensor: tensor<?x?x4xf32>, %dim0: index, %dim1: index, %storage: !hal.buffer_view) -> !hal.buffer_view {
+util.func public @exportBufferViewInPlaceToView(%tensor: tensor<?x?x4xf32>, %dim0: index, %dim1: index, %storage: !hal.buffer_view) -> !hal.buffer_view {
   //      CHECK: %[[STORAGE_SIZE:.+]] = stream.tensor.sizeof tensor<?x?x4xf32>{%[[DIM0]], %[[DIM1]]} : index
   // CHECK-NEXT: %[[STORAGE_IMPORT:.+]] = stream.tensor.import %[[STORAGE]]
   // CHECK-SAME:   : !hal.buffer_view -> tensor<?x?x4xf32>{%[[DIM0]], %[[DIM1]]} in !stream.resource<external>{%[[STORAGE_SIZE]]}
   %0 = hal.tensor.export %tensor into(%storage : !hal.buffer_view) : tensor<?x?x4xf32>{%dim0, %dim1} -> !hal.buffer_view
-  return %0 : !hal.buffer_view
+  util.return %0 : !hal.buffer_view
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/PatternUtils.cpp b/compiler/src/iree/compiler/Dialect/Stream/Conversion/PatternUtils.cpp
index 93b7e4f..46c1c83 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/PatternUtils.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/PatternUtils.cpp

@@ -11,6 +11,32 @@
 
 namespace mlir::iree_compiler {
 
+void expandResourceOperand(Location loc, Value operand,
+                           SmallVectorImpl<Value> &newOperands,
+                           OpBuilder &builder) {
+  if (llvm::isa<TensorType>(operand.getType())) {
+    auto value = consumeTensorOperand(loc, operand, builder);
+    newOperands.push_back(value.resource);
+    newOperands.push_back(value.resourceSize);
+  } else if (llvm::isa<IREE::Stream::ResourceType>(operand.getType())) {
+    newOperands.push_back(operand);
+    newOperands.push_back(
+        builder.createOrFold<IREE::Stream::ResourceSizeOp>(loc, operand));
+  } else {
+    newOperands.push_back(operand);
+  }
+}
+
+SmallVector<Value> expandResourceOperands(Location loc, ValueRange operands,
+                                          ConversionPatternRewriter &rewriter) {
+  SmallVector<Value> expandedOperands;
+  expandedOperands.reserve(operands.size());
+  for (auto operand : operands) {
+    expandResourceOperand(loc, operand, expandedOperands, rewriter);
+  }
+  return expandedOperands;
+}
+
 ConvertedTensor consumeTensorOperand(Location loc, Value operand,
                                      OpBuilder &builder) {
   auto operandType = operand.getType();

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/PatternUtils.h b/compiler/src/iree/compiler/Dialect/Stream/Conversion/PatternUtils.h
index 0d4ba17..a7a864f 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/PatternUtils.h
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/PatternUtils.h

@@ -13,6 +13,13 @@
 
 namespace mlir::iree_compiler {
 
+void expandResourceOperand(Location loc, Value operand,
+                           SmallVectorImpl<Value> &newOperands,
+                           OpBuilder &builder);
+
+SmallVector<Value> expandResourceOperands(Location loc, ValueRange operands,
+                                          ConversionPatternRewriter &rewriter);
+
 // https://reviews.llvm.org/D111620 broke 1->N type expansion during dialect
 // conversion. It inserts unrealized_conversion_casts but then passes the
 // illegal source dialect types for pattern operands, meaning that even though

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/StandardToStream/BUILD.bazel b/compiler/src/iree/compiler/Dialect/Stream/Conversion/StandardToStream/BUILD.bazel
index 17a87b0..e5ab347 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/StandardToStream/BUILD.bazel
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/StandardToStream/BUILD.bazel

@@ -28,7 +28,6 @@
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:ArithDialect",
         "@llvm-project//mlir:ControlFlowDialect",
-        "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:FunctionInterfaces",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:MemRefDialect",

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/StandardToStream/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/Stream/Conversion/StandardToStream/CMakeLists.txt
index d46e22a..d1462bc 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/StandardToStream/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/StandardToStream/CMakeLists.txt

@@ -23,7 +23,6 @@
     LLVMSupport
     MLIRArithDialect
     MLIRControlFlowDialect
-    MLIRFuncDialect
     MLIRFunctionInterfaces
     MLIRIR
     MLIRMemRefDialect

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/StandardToStream/ConvertStructuralOps.cpp b/compiler/src/iree/compiler/Dialect/Stream/Conversion/StandardToStream/ConvertStructuralOps.cpp
index 6a1afe7..1ea3b7d 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/StandardToStream/ConvertStructuralOps.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/StandardToStream/ConvertStructuralOps.cpp

@@ -13,7 +13,6 @@
 #include "llvm/ADT/SmallVector.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
@@ -26,140 +25,6 @@
 
 namespace {
 
-struct FuncOpSignatureConversion
-    : public OpConversionPattern<mlir::func::FuncOp> {
-  using OpConversionPattern::OpConversionPattern;
-  LogicalResult
-  matchAndRewrite(mlir::func::FuncOp funcOp, OpAdaptor adaptor,
-                  ConversionPatternRewriter &rewriter) const override {
-    auto &typeConverter = *getTypeConverter();
-
-    // Convert the input signature types.
-    // TODO(benvanik): dynamic shapes by passing in tensor dynamic dims.
-    auto originalType = funcOp.getFunctionType();
-    TypeConverter::SignatureConversion newSignature(
-        originalType.getNumInputs());
-    for (auto argType : llvm::enumerate(originalType.getInputs())) {
-      if (failed(typeConverter.convertSignatureArg(
-              argType.index(), argType.value(), newSignature))) {
-        return failure();
-      }
-    }
-    SmallVector<Type> newResultTypes;
-    if (failed(typeConverter.convertTypes(originalType.getResults(),
-                                          newResultTypes))) {
-      return failure();
-    }
-
-    // Replace function.
-    auto newFuncOp = rewriter.cloneWithoutRegions(funcOp);
-    newFuncOp.getBlocks().clear();
-    rewriter.inlineRegionBefore(funcOp.getFunctionBody(),
-                                newFuncOp.getFunctionBody(), newFuncOp.end());
-    newFuncOp.setType(rewriter.getFunctionType(newSignature.getConvertedTypes(),
-                                               newResultTypes));
-    if (failed(rewriter.convertRegionTypes(&newFuncOp.getFunctionBody(),
-                                           typeConverter, &newSignature))) {
-      return failure();
-    }
-
-    rewriter.eraseOp(funcOp);
-    return success();
-  }
-};
-
-static SmallVector<Value>
-expandResourceOperands(Location loc, ValueRange operands,
-                       ConversionPatternRewriter &rewriter) {
-  SmallVector<Value> expandedOperands;
-  expandedOperands.reserve(operands.size());
-  for (auto operand : operands) {
-    if (llvm::isa<TensorType>(operand.getType())) {
-      auto value = consumeTensorOperand(loc, operand, rewriter);
-      expandedOperands.push_back(value.resource);
-      expandedOperands.push_back(value.resourceSize);
-    } else if (llvm::isa<IREE::Stream::ResourceType>(operand.getType())) {
-      expandedOperands.push_back(operand);
-      expandedOperands.push_back(
-          rewriter.createOrFold<IREE::Stream::ResourceSizeOp>(loc, operand));
-    } else {
-      expandedOperands.push_back(operand);
-    }
-  }
-  return expandedOperands;
-}
-
-struct CallOpConversion : public OpConversionPattern<mlir::func::CallOp> {
-  using OpConversionPattern::OpConversionPattern;
-  LogicalResult
-  matchAndRewrite(mlir::func::CallOp op, OpAdaptor adaptor,
-                  ConversionPatternRewriter &rewriter) const override {
-    // Expand any resource operands to resource + size.
-    auto expandedOperands =
-        expandResourceOperands(op.getLoc(), adaptor.getOperands(), rewriter);
-
-    // Expand any resource results to resource + size.
-    SmallVector<Type> expandedTypes;
-    struct Result {
-      size_t originalIndex;
-      size_t newIndex;
-      Type newType;
-    };
-    SmallVector<Result> resultMap;
-    for (auto originalType : llvm::enumerate(op.getResultTypes())) {
-      SmallVector<Type> newTypes;
-      if (failed(getTypeConverter()->convertType(originalType.value(),
-                                                 newTypes))) {
-        return rewriter.notifyMatchFailure(op,
-                                           "unable to convert result types");
-      }
-      resultMap.push_back(
-          Result{originalType.index(), expandedTypes.size(), newTypes.front()});
-      expandedTypes.append(newTypes);
-    }
-
-    // Create a new call that takes the expanded input operands and returns the
-    // expanded output results. We can't directly replace the original call as
-    // the result counts differ.
-    auto callOp = rewriter.create<mlir::func::CallOp>(
-        op.getLoc(), expandedTypes, op.getCallee(), expandedOperands);
-
-    // Tie all resource results together so we end up with 1:1 results with the
-    // original op.
-    SmallVector<Value> results;
-    for (auto result : resultMap) {
-      if (llvm::isa<IREE::Stream::ResourceType>(result.newType)) {
-        auto oldType = op.getResult(result.originalIndex).getType();
-        auto resource = callOp.getResult(result.newIndex + 0);
-        auto resourceSize = callOp.getResult(result.newIndex + 1);
-        results.push_back(rewriter
-                              .create<mlir::UnrealizedConversionCastOp>(
-                                  op.getLoc(), TypeRange{oldType},
-                                  ValueRange{resource, resourceSize})
-                              .getResult(0));
-      } else {
-        results.push_back(callOp.getResult(result.newIndex));
-      }
-    }
-    rewriter.replaceOp(op, results);
-
-    return success();
-  }
-};
-
-struct ReturnOpConversion : public OpConversionPattern<mlir::func::ReturnOp> {
-  using OpConversionPattern::OpConversionPattern;
-  LogicalResult
-  matchAndRewrite(mlir::func::ReturnOp op, OpAdaptor adaptor,
-                  ConversionPatternRewriter &rewriter) const override {
-    // Expand any resource operands to resource + size.
-    auto expandedOperands =
-        expandResourceOperands(op.getLoc(), adaptor.getOperands(), rewriter);
-    rewriter.replaceOpWithNewOp<mlir::func::ReturnOp>(op, expandedOperands);
-    return success();
-  }
-};
-
 struct BranchOpConversion : public OpConversionPattern<mlir::cf::BranchOp> {
   using OpConversionPattern::OpConversionPattern;
   LogicalResult
@@ -495,6 +360,19 @@
 
 } // namespace
 
+template <typename OpT>
+static inline void addGenericLegalOp(ConversionTarget &conversionTarget,
+                                     TypeConverter &typeConverter) {
+  conversionTarget.addDynamicallyLegalOp<OpT>([&](OpT op) {
+    return llvm::all_of(
+               op->getOperandTypes(),
+               [&typeConverter](Type t) { return typeConverter.isLegal(t); }) &&
+           llvm::all_of(op->getResultTypes(), [&typeConverter](Type t) {
+             return typeConverter.isLegal(t);
+           });
+  });
+}
+
 void populateStandardStructuralToStreamPatterns(
     MLIRContext *context, ConversionTarget &conversionTarget,
     TypeConverter &typeConverter, RewritePatternSet &patterns) {
@@ -504,82 +382,25 @@
   // dynamic legality checker to force any ops using such types to run through
   // our patterns.
 
-  conversionTarget.addDynamicallyLegalOp<mlir::func::FuncOp>(
-      [&](mlir::func::FuncOp op) {
-        return typeConverter.isSignatureLegal(op.getFunctionType()) &&
-               typeConverter.isLegal(&op.getBody());
-      });
-  conversionTarget.addDynamicallyLegalOp<mlir::func::CallOp>(
-      [&](mlir::func::CallOp op) {
-        return llvm::all_of(
-                   op.getOperandTypes(),
-                   [&](Type type) { return typeConverter.isLegal(type); }) &&
-               llvm::all_of(op.getResultTypes(), [&](Type type) {
-                 return typeConverter.isLegal(type);
-               });
-      });
-  conversionTarget.addDynamicallyLegalOp<mlir::func::ReturnOp>(
-      [&](mlir::func::ReturnOp op) {
-        return llvm::all_of(op.getOperandTypes(), [&](Type type) {
-          return typeConverter.isLegal(type);
-        });
-      });
-
-  conversionTarget.addDynamicallyLegalOp<mlir::cf::BranchOp>(
-      [&](mlir::cf::BranchOp op) {
-        return llvm::all_of(op.getOperandTypes(), [&](Type type) {
-          return typeConverter.isLegal(type);
-        });
-      });
-  conversionTarget.addDynamicallyLegalOp<mlir::cf::CondBranchOp>(
-      [&](mlir::cf::CondBranchOp op) {
-        return llvm::all_of(op.getOperandTypes(), [&](Type type) {
-          return typeConverter.isLegal(type);
-        });
-      });
-  conversionTarget.addDynamicallyLegalOp<mlir::cf::SwitchOp>(
-      [&](mlir::cf::SwitchOp op) {
-        return llvm::all_of(op.getOperandTypes(), [&](Type type) {
-          return typeConverter.isLegal(type);
-        });
-      });
-  conversionTarget.addDynamicallyLegalOp<mlir::scf::IfOp>(
-      [&](mlir::scf::IfOp op) {
-        return llvm::all_of(op.getResultTypes(), [&](Type type) {
-          return typeConverter.isLegal(type);
-        });
-      });
-  conversionTarget.addDynamicallyLegalOp<mlir::scf::ForOp>(
-      [&](mlir::scf::ForOp op) {
-        return llvm::all_of(op.getResultTypes(), [&](Type type) {
-          return typeConverter.isLegal(type);
-        });
-      });
-  conversionTarget.addDynamicallyLegalOp<mlir::scf::WhileOp>(
-      [&](mlir::scf::WhileOp op) {
-        return llvm::all_of(op.getResultTypes(), [&](Type type) {
-          return typeConverter.isLegal(type);
-        });
-      });
-  conversionTarget.addDynamicallyLegalOp<mlir::scf::ConditionOp>(
-      [&](mlir::scf::ConditionOp op) {
-        return llvm::all_of(op.getOperandTypes(), [&](Type type) {
-          return typeConverter.isLegal(type);
-        });
-      });
-  conversionTarget.addDynamicallyLegalOp<mlir::scf::YieldOp>(
-      [&](mlir::scf::YieldOp op) {
-        return llvm::all_of(op.getOperandTypes(), [&](Type type) {
-          return typeConverter.isLegal(type);
-        });
-      });
-
+  addGenericLegalOp<mlir::cf::BranchOp>(conversionTarget, typeConverter);
+  addGenericLegalOp<mlir::cf::CondBranchOp>(conversionTarget, typeConverter);
+  addGenericLegalOp<mlir::cf::SwitchOp>(conversionTarget, typeConverter);
   patterns
-      .insert<FuncOpSignatureConversion, CallOpConversion, ReturnOpConversion,
-              BranchOpConversion, CondBranchOpConversion, SwitchOpConversion,
-              SelectOpConversion, ScfConditionOpConversion, ScfIfOpConversion,
-              ScfForOpConversion, ScfWhileOpConversion, ScfYieldOpConversion>(
+      .insert<BranchOpConversion, CondBranchOpConversion, SwitchOpConversion>(
           typeConverter, context);
+
+  addGenericLegalOp<mlir::arith::SelectOp>(conversionTarget, typeConverter);
+  patterns.insert<SelectOpConversion>(typeConverter, context);
+
+  addGenericLegalOp<mlir::scf::IfOp>(conversionTarget, typeConverter);
+  addGenericLegalOp<mlir::scf::ForOp>(conversionTarget, typeConverter);
+  addGenericLegalOp<mlir::scf::WhileOp>(conversionTarget, typeConverter);
+  addGenericLegalOp<mlir::scf::ConditionOp>(conversionTarget, typeConverter);
+  addGenericLegalOp<mlir::scf::YieldOp>(conversionTarget, typeConverter);
+  patterns
+      .insert<ScfConditionOpConversion, ScfIfOpConversion, ScfForOpConversion,
+              ScfWhileOpConversion, ScfYieldOpConversion>(typeConverter,
+                                                          context);
 }
 
 } // namespace mlir::iree_compiler

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/StandardToStream/test/constant_ops.mlir b/compiler/src/iree/compiler/Dialect/Stream/Conversion/StandardToStream/test/constant_ops.mlir
index e0ed3d5..be63175 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/StandardToStream/test/constant_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/StandardToStream/test/constant_ops.mlir

@@ -1,19 +1,19 @@
 // RUN: iree-opt --split-input-file --iree-stream-conversion %s | FileCheck %s
 
 // CHECK-LABEL: @constantTensor
-func.func @constantTensor() {
+util.func public @constantTensor() {
   // CHECK: %[[CST:.+]] = stream.tensor.constant : tensor<2xi32> in !stream.resource<constant> = dense<[1, 2]> : tensor<2xi32>
   // CHECK: %[[SIZE:.+]] = stream.resource.size %[[CST]] : !stream.resource<constant>
   // CHECK: %[[T:.+]] = stream.async.transfer %[[CST]] : !stream.resource<constant>{%[[SIZE]]} -> !stream.resource<*>{%[[SIZE]]}
   %0 = arith.constant dense<[1, 2]> : tensor<2xi32>
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @emptyTensor
-func.func @emptyTensor() {
+util.func public @emptyTensor() {
   // CHECK: %[[CST:.+]] = stream.tensor.constant : tensor<2x0xi32> in !stream.resource<constant> = dense<> : tensor<2x0xi32>
   %0 = arith.constant dense<> : tensor<2x0xi32>
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/StandardToStream/test/structural_ops.mlir b/compiler/src/iree/compiler/Dialect/Stream/Conversion/StandardToStream/test/structural_ops.mlir
index 2c5f0c6..1f5293b 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/StandardToStream/test/structural_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/StandardToStream/test/structural_ops.mlir

@@ -1,35 +1,15 @@
 // RUN: iree-opt --split-input-file --iree-stream-conversion %s | FileCheck %s
 
-// CHECK-LABEL: @functionExpansion
-//  CHECK-SAME: (%[[ARG0:.+]]: !stream.resource<*>, %[[ARG0_SIZE:.+]]: index,
-//   CHECK-SAME: %[[ARG1:.+]]: i1,
-//  CHECK-SAME:  %[[ARG2:.+]]: !stream.resource<*>, %[[ARG2_SIZE:.+]]: index)
-//  CHECK-SAME: -> (!stream.resource<*>, index, i1, !stream.resource<*>, index)
-func.func @functionExpansion(%arg0: tensor<4x?xf32>, %arg1: i1, %arg2: tensor<i32>)
-    -> (tensor<4x?xf32>, i1, tensor<i32>) {
-  // CHECK-NEXT: %[[RET:.+]]:5 = call @callee(%[[ARG0]], %[[ARG0_SIZE]], %[[ARG1]], %[[ARG2]], %[[ARG2_SIZE]])
-  // CHECK-SAME: : (!stream.resource<*>, index, i1, !stream.resource<*>, index) -> (!stream.resource<*>, index, i1, !stream.resource<*>, index)
-  %0:3 = call @callee(%arg0, %arg1, %arg2) : (tensor<4x?xf32>, i1, tensor<i32>) -> (tensor<4x?xf32>, i1, tensor<i32>)
-  // CHECK: return %[[RET]]#0, %[[RET]]#1, %[[RET]]#2,  %[[RET]]#3, %[[RET]]#4 : !stream.resource<*>, index, i1, !stream.resource<*>, index
-  return %0#0, %0#1, %0#2 : tensor<4x?xf32>, i1, tensor<i32>
-}
-
-// CHECK: func.func private @callee
-func.func private @callee(%arg0: tensor<4x?xf32>, %arg1: i1, %arg2: tensor<i32>)
-    -> (tensor<4x?xf32>, i1, tensor<i32>)
-
-// -----
-
 // CHECK-LABEL: @brExpansion
 //  CHECK-SAME: (%[[ARG0:.+]]: !stream.resource<*>, %[[ARG0_SIZE:.+]]: index, %arg2: i1)
 //  CHECK-SAME: -> (!stream.resource<*>, index, i1)
-func.func @brExpansion(%arg0: tensor<1xf32>, %arg1: i1) -> (tensor<1xf32>, i1) {
+util.func public @brExpansion(%arg0: tensor<1xf32>, %arg1: i1) -> (tensor<1xf32>, i1) {
   // CHECK: cf.br ^bb1(%[[ARG0]], %[[ARG0_SIZE]], %arg2 : !stream.resource<*>, index, i1)
   cf.br ^bb1(%arg0, %arg1 : tensor<1xf32>, i1)
 // CHECK: ^bb1(%[[BB_ARG0:.+]]: !stream.resource<*>, %[[BB_ARG1:.+]]: index, %[[BB_ARG2:.+]]: i1):
 ^bb1(%0: tensor<1xf32>, %1: i1):
-  // CHECK: return %[[BB_ARG0]], %[[BB_ARG1]], %[[BB_ARG2]] : !stream.resource<*>, index, i1
-  return %0, %1 : tensor<1xf32>, i1
+  // CHECK: util.return %[[BB_ARG0]], %[[BB_ARG1]], %[[BB_ARG2]] : !stream.resource<*>, index, i1
+  util.return %0, %1 : tensor<1xf32>, i1
 }
 
 // -----
@@ -38,14 +18,14 @@
 //  CHECK-SAME: (%[[ARG0:.+]]: !stream.resource<*>, %[[ARG0_SIZE:.+]]: index,
 //  CHECK-SAME:  %[[ARG1:.+]]: !stream.resource<*>, %[[ARG1_SIZE:.+]]: index)
 //  CHECK-SAME: -> (!stream.resource<*>, index)
-func.func @condBrExpansion(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> {
+util.func public @condBrExpansion(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> {
   %true = arith.constant 1 : i1
   //      CHECK: cf.cond_br %true,
   // CHECK-SAME:   ^bb1(%[[ARG0]], %[[ARG0_SIZE]] : !stream.resource<*>, index),
   // CHECK-SAME:   ^bb1(%[[ARG1]], %[[ARG1_SIZE]] : !stream.resource<*>, index)
   cf.cond_br %true, ^bb1(%arg0 : tensor<1xf32>), ^bb1(%arg1 : tensor<1xf32>)
 ^bb1(%0: tensor<1xf32>):
-  return %0 : tensor<1xf32>
+  util.return %0 : tensor<1xf32>
 }
 
 // -----
@@ -54,7 +34,7 @@
 //  CHECK-SAME: (%[[ARG0:.+]]: !stream.resource<*>, %[[ARG0_SIZE:.+]]: index,
 //  CHECK-SAME:  %[[ARG1:.+]]: !stream.resource<*>, %[[ARG1_SIZE:.+]]: index)
 //  CHECK-SAME: -> (!stream.resource<*>, index)
-func.func @switchExpansion(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> {
+util.func public @switchExpansion(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> {
   %flag = arith.constant 1 : i32
   //      CHECK: %[[FLAG:.+]] = arith.constant 1 : i32
   //      CHECK: cf.switch %[[FLAG]] : i32, [
@@ -66,9 +46,9 @@
     0: ^bb2(%arg1 : tensor<1xf32>)
   ]
 ^bb1(%0: tensor<1xf32>):
-  return %0 : tensor<1xf32>
+  util.return %0 : tensor<1xf32>
 ^bb2(%1: tensor<1xf32>):
-  return %1 : tensor<1xf32>
+  util.return %1 : tensor<1xf32>
 }
 
 // -----
@@ -78,19 +58,19 @@
 //  CHECK-SAME:  %[[COND:.+]]: i1,
 //  CHECK-SAME:  %[[ARG1:.+]]: !stream.resource<*>, %[[ARG1_SIZE:.+]]: index)
 //  CHECK-SAME: -> (!stream.resource<*>, index)
-func.func @selectExpansion(%arg0: tensor<1xf32>, %cond: i1, %arg1: tensor<1xf32>) -> tensor<1xf32> {
+util.func public @selectExpansion(%arg0: tensor<1xf32>, %cond: i1, %arg1: tensor<1xf32>) -> tensor<1xf32> {
   // CHECK-DAG: %[[RET:.+]] = arith.select %[[COND]], %[[ARG0]], %[[ARG1]] : !stream.resource<*>
   // CHECK-DAG: %[[RET_SIZE:.+]] = arith.select %[[COND]], %[[ARG0_SIZE]], %[[ARG1_SIZE]] : index
   %0 = arith.select %cond, %arg0, %arg1 : tensor<1xf32>
-  // CHECK: return %[[RET]], %[[RET_SIZE]] : !stream.resource<*>, index
-  return %0 : tensor<1xf32>
+  // CHECK: util.return %[[RET]], %[[RET_SIZE]] : !stream.resource<*>, index
+  util.return %0 : tensor<1xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @scfIfExpansion
 // CHECK-SAME: %[[COND:.+]]: i1, %[[ARG0:.+]]: !stream.resource<*>, %[[IDX0:.+]]: index, %[[ARG1:.+]]: !stream.resource<*>, %[[IDX1:.+]]: index
-func.func @scfIfExpansion(%cond: i1, %arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> {
+util.func public @scfIfExpansion(%cond: i1, %arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> {
   // CHECK: %[[IF:.+]]:2 = scf.if %arg0 -> (!stream.resource<*>, index)
   %0 = scf.if %cond -> tensor<1xf32> {
     // CHECK: scf.yield %[[ARG0]], %[[IDX0]]
@@ -99,15 +79,15 @@
     // CHECK: scf.yield %[[ARG1]], %[[IDX1]]
     scf.yield %arg1 : tensor<1xf32>
   }
-  // CHECK: return %[[IF]]#0, %[[IF]]#1
-  return %0 : tensor<1xf32>
+  // CHECK: util.return %[[IF]]#0, %[[IF]]#1
+  util.return %0 : tensor<1xf32>
 }
 
 // -----
 
 // CHECK-LABEL: @scfWhileExpansion
 // CHECK-SAME: %[[ARG0:.+]]: i32, %[[ARG1:.+]]: !stream.resource<*>, %[[ARG2:.+]]: index
-func.func @scfWhileExpansion(%arg0 : i32, %arg1 : tensor<1xf32>) {
+util.func public @scfWhileExpansion(%arg0 : i32, %arg1 : tensor<1xf32>) {
   %c1 = arith.constant 1 : i32
   %c10 = arith.constant 10 : i32
   // CHECK: scf.while
@@ -124,7 +104,7 @@
   // CHECK: scf.yield %[[V:.+]], %[[ARG1]], %[[ARG2]] : i32, !stream.resource<*>, index
     scf.yield %1, %arg1 : i32, tensor<1xf32>
   }
-  return
+  util.return
 }
 
 // -----
@@ -133,7 +113,7 @@
 // CHECK-SAME: %[[ARG0:.+]]: index,
 // CHECK-SAME: %[[ARG1:.+]]: !stream.resource<*>,
 // CHECK-SAME: %[[ARG2:.+]]: index
-func.func @scfWhileExpansion(%arg0 : index, %arg1 : tensor<1xf32>) {
+util.func public @scfWhileExpansion(%arg0 : index, %arg1 : tensor<1xf32>) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
 
@@ -143,5 +123,5 @@
   scf.for %i = %c0 to %arg0 step %c1 iter_args(%arg2 = %arg1) -> (tensor<1xf32>) {
     scf.yield %arg2 : tensor<1xf32>
   }
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/UtilToStream/BUILD.bazel b/compiler/src/iree/compiler/Dialect/Stream/Conversion/UtilToStream/BUILD.bazel
index 0484ee7..4a01cc1 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/UtilToStream/BUILD.bazel
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/UtilToStream/BUILD.bazel

@@ -25,7 +25,6 @@
         "//compiler/src/iree/compiler/Dialect/Stream/IR",
         "//compiler/src/iree/compiler/Dialect/Util/Conversion",
         "//compiler/src/iree/compiler/Dialect/Util/IR",
-        "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:FunctionInterfaces",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Transforms",

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/UtilToStream/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/Stream/Conversion/UtilToStream/CMakeLists.txt
index ff93dbd..e3957bb 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/UtilToStream/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/UtilToStream/CMakeLists.txt

@@ -18,7 +18,6 @@
   SRCS
     "Patterns.cpp"
   DEPS
-    MLIRFuncDialect
     MLIRFunctionInterfaces
     MLIRIR
     MLIRTransforms

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/UtilToStream/Patterns.cpp b/compiler/src/iree/compiler/Dialect/Stream/Conversion/UtilToStream/Patterns.cpp
index c73549f..ad2ba06 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/UtilToStream/Patterns.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/UtilToStream/Patterns.cpp

@@ -20,6 +20,122 @@
 namespace {
 
 //===----------------------------------------------------------------------===//
+// Structural ops
+//===----------------------------------------------------------------------===//
+
+struct FuncOpSignatureConversion
+    : public OpConversionPattern<IREE::Util::FuncOp> {
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(IREE::Util::FuncOp funcOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto &typeConverter = *getTypeConverter();
+
+    // Replace function and convert the signature for region conversion below.
+    TypeConverter::SignatureConversion newSignature(funcOp.getNumArguments());
+    auto newFuncOp = rewriter.cloneWithoutRegions(funcOp);
+    bool anyFailed = false;
+    newFuncOp.expandSignature(
+        [&](unsigned i, Type type, SmallVectorImpl<Type> &newTypes) {
+          if (failed(typeConverter.convertTypes(type, newTypes))) {
+            anyFailed = true;
+          }
+          if (failed(
+                  typeConverter.convertSignatureArg(i, type, newSignature))) {
+            anyFailed = true;
+          }
+        },
+        [&](unsigned i, Type type, SmallVectorImpl<Type> &newTypes) {
+          if (failed(typeConverter.convertTypes(type, newTypes))) {
+            anyFailed = true;
+          }
+        });
+    if (anyFailed) {
+      return rewriter.notifyMatchFailure(
+          funcOp, "unable to convert argument/result types");
+    }
+    newFuncOp.getBlocks().clear();
+    rewriter.inlineRegionBefore(funcOp.getFunctionBody(),
+                                newFuncOp.getFunctionBody(), newFuncOp.end());
+    if (failed(rewriter.convertRegionTypes(&newFuncOp.getFunctionBody(),
+                                           typeConverter, &newSignature))) {
+      return failure();
+    }
+
+    rewriter.eraseOp(funcOp);
+    return success();
+  }
+};
+
+struct CallOpConversion : public OpConversionPattern<IREE::Util::CallOp> {
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(IREE::Util::CallOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    // Create a new call that takes the expanded input operands and returns the
+    // expanded output results. We can't directly replace the original call as
+    // the result counts differ.
+    struct Result {
+      size_t originalIndex;
+      size_t newIndex;
+      Type newType;
+    };
+    SmallVector<Result> resultMap;
+    bool anyFailed = false;
+    auto callOp = op.cloneAndExpand(
+        [&](unsigned i, Value operand, SmallVectorImpl<Value> &newOperands) {
+          auto adaptorOperand = adaptor.getOperands()[i];
+          expandResourceOperand(op.getLoc(), adaptorOperand, newOperands,
+                                rewriter);
+        },
+        [&](unsigned i, Type type, SmallVectorImpl<Type> &newTypes) {
+          size_t newIndex = newTypes.size();
+          if (failed(getTypeConverter()->convertType(type, newTypes)))
+            anyFailed = true;
+          resultMap.push_back(Result{i, newIndex, newTypes[newIndex]});
+        },
+        rewriter);
+    if (anyFailed) {
+      return rewriter.notifyMatchFailure(op, "unable to convert result types");
+    }
+
+    // Tie all resource results together so we end up with 1:1 results with the
+    // original op.
+    SmallVector<Value> results;
+    for (auto result : resultMap) {
+      if (llvm::isa<IREE::Stream::ResourceType>(result.newType)) {
+        auto oldType = op.getResult(result.originalIndex).getType();
+        auto resource = callOp.getResult(result.newIndex + 0);
+        auto resourceSize = callOp.getResult(result.newIndex + 1);
+        results.push_back(rewriter
+                              .create<mlir::UnrealizedConversionCastOp>(
+                                  op.getLoc(), TypeRange{oldType},
+                                  ValueRange{resource, resourceSize})
+                              .getResult(0));
+      } else {
+        results.push_back(callOp.getResult(result.newIndex));
+      }
+    }
+    rewriter.replaceOp(op, results);
+
+    return success();
+  }
+};
+
+struct ReturnOpConversion : public OpConversionPattern<IREE::Util::ReturnOp> {
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(IREE::Util::ReturnOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    // Expand any resource operands to resource + size.
+    auto expandedOperands =
+        expandResourceOperands(op.getLoc(), adaptor.getOperands(), rewriter);
+    rewriter.replaceOpWithNewOp<IREE::Util::ReturnOp>(op, expandedOperands);
+    return success();
+  }
+};
+
+//===----------------------------------------------------------------------===//
 // Globals
 //===----------------------------------------------------------------------===//
 
@@ -223,6 +339,10 @@
 void populateUtilToStreamConversionPatterns(MLIRContext *context,
                                             TypeConverter &typeConverter,
                                             RewritePatternSet &patterns) {
+  patterns
+      .insert<FuncOpSignatureConversion, CallOpConversion, ReturnOpConversion>(
+          typeConverter, context);
+
   auto expansionState = std::make_shared<GlobalExpansionState>();
   // TODO(#7432): add indirect global expansion support to streams.
   patterns
@@ -259,8 +379,15 @@
         return success();
       });
 
-  conversionTarget
-      .addLegalOp<IREE::Util::InitializerOp, IREE::Util::ReturnOp>();
+  conversionTarget.addLegalOp<IREE::Util::InitializerOp>();
+  conversionTarget.addDynamicallyLegalOp<IREE::Util::FuncOp>(
+      [&](IREE::Util::FuncOp op) {
+        return typeConverter.isSignatureLegal(op.getFunctionType()) &&
+               typeConverter.isLegal(&op.getBody());
+      });
+  addGenericLegalOp<IREE::Util::CallOp>(conversionTarget, typeConverter);
+  addGenericLegalOp<IREE::Util::ReturnOp>(conversionTarget, typeConverter);
+
   conversionTarget.addDynamicallyLegalOp<IREE::Util::GlobalOp>(
       [&](IREE::Util::GlobalOp op) {
         return typeConverter.isLegal(op.getType()) &&

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/UtilToStream/test/BUILD.bazel b/compiler/src/iree/compiler/Dialect/Stream/Conversion/UtilToStream/test/BUILD.bazel
index 888725a..3ed0e2c 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/UtilToStream/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/UtilToStream/test/BUILD.bazel

@@ -18,6 +18,7 @@
         [
             "compiler_hints.mlir",
             "global_ops.mlir",
+            "structural_ops.mlir",
         ],
         include = ["*.mlir"],
     ),

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/UtilToStream/test/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/Stream/Conversion/UtilToStream/test/CMakeLists.txt
index 3223597..43ad040 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/UtilToStream/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/UtilToStream/test/CMakeLists.txt

@@ -16,6 +16,7 @@
   SRCS
     "compiler_hints.mlir"
     "global_ops.mlir"
+    "structural_ops.mlir"
   TOOLS
     FileCheck
     iree-opt

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/UtilToStream/test/compiler_hints.mlir b/compiler/src/iree/compiler/Dialect/Stream/Conversion/UtilToStream/test/compiler_hints.mlir
index d4ea662..f12a2ad 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/UtilToStream/test/compiler_hints.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/UtilToStream/test/compiler_hints.mlir

@@ -1,11 +1,11 @@
 // RUN: iree-opt --split-input-file --iree-stream-conversion %s | FileCheck %s
 
 // CHECK-LABEL: @optimizationBarrier
-func.func @optimizationBarrier(%arg0: tensor<i32>) -> tensor<i32> {
+util.func public @optimizationBarrier(%arg0: tensor<i32>) -> tensor<i32> {
   // CHECK: stream.async.transfer
   // CHECK: %[[RESOURCE:.*]] = util.optimization_barrier %0
   // CHECK: %[[SIZE:.*]] = stream.resource.size %1 : !stream.resource<*>
-  // CHECK: return %[[RESOURCE]], %[[SIZE]] : !stream.resource<*>, index
+  // CHECK: util.return %[[RESOURCE]], %[[SIZE]] : !stream.resource<*>, index
   %0 = util.optimization_barrier %arg0 : tensor<i32>
-  return %0 : tensor<i32>
+  util.return %0 : tensor<i32>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/UtilToStream/test/global_ops.mlir b/compiler/src/iree/compiler/Dialect/Stream/Conversion/UtilToStream/test/global_ops.mlir
index 105a513..34b8bba 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/UtilToStream/test/global_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/UtilToStream/test/global_ops.mlir

@@ -4,7 +4,7 @@
 // CHECK: util.global public mutable @var_i32__size : index
 util.global public mutable @var_i32 : tensor<i32>
 // CHECK-LABEL: @mutableGlobal
-func.func @mutableGlobal() {
+util.func public @mutableGlobal() {
   // CHECK-DAG: %[[VAR:.+]] = util.global.load @var_i32 : !stream.resource<variable>
   // CHECK-DAG: %[[SIZE:.+]] = util.global.load @var_i32__size : index
   //     CHECK: %[[LOAD_T:.+]] = stream.async.transfer %[[VAR]] : !stream.resource<variable>{%[[SIZE]]} -> !stream.resource<*>{%[[SIZE]]}
@@ -13,18 +13,18 @@
   // CHECK-DAG: util.global.store %[[STORE_T]], @var_i32 : !stream.resource<variable>
   // CHECK-DAG: util.global.store %[[SIZE]], @var_i32__size : index
   util.global.store %0, @var_i32 : tensor<i32>
-  return
+  util.return
 }
 
 // -----
 
 // TODO(#7432): add indirect global expansion support to streams.
 // util.global public mutable @var_indirect : tensor<i32>
-// func.func @mutableGlobalIndirect() {
+// util.func public @mutableGlobalIndirect() {
 //   %0 = util.global.address @var_indirect : !util.ptr<tensor<i32>>
 //   %1 = util.global.load.indirect %0 : !util.ptr<tensor<i32>> -> tensor<i32>
 //   util.global.store.indirect %1, %0 : tensor<i32> -> !util.ptr<tensor<i32>>
-//   return
+//   util.return
 // }
 
 // -----
@@ -38,14 +38,14 @@
 //  CHECK-DAG:   util.global.store %[[SIZE]], @var_with_tensor_initializer__size : index
 util.global public mutable @var_with_tensor_initializer = dense<0.000000e+00> : tensor<f32>
 // CHECK-LABEL: @initializedGlobal
-func.func @initializedGlobal() {
+util.func public @initializedGlobal() {
   // CHECK-DAG: = util.global.load @var_with_tensor_initializer : !stream.resource<variable>
   // CHECK-DAG: = util.global.load @var_with_tensor_initializer__size : index
   %0 = util.global.load @var_with_tensor_initializer : tensor<f32>
   // CHECK-DAG: util.global.store %{{.+}}, @var_with_tensor_initializer : !stream.resource<variable>
   // CHECK-DAG: util.global.store %{{.+}}, @var_with_tensor_initializer__size : index
   util.global.store %0, @var_with_tensor_initializer : tensor<f32>
-  return
+  util.return
 }
 
 // -----
@@ -59,14 +59,14 @@
 //  CHECK-DAG:   util.global.store %[[SIZE]], @var_with_tensor_uninitialized__size : index
 util.global private mutable @var_with_tensor_uninitialized = #util.uninitialized : tensor<4xf32>
 // CHECK-LABEL: @uninitializedGlobalTensor
-func.func @uninitializedGlobalTensor() {
+util.func public @uninitializedGlobalTensor() {
   // CHECK-DAG: = util.global.load @var_with_tensor_uninitialized : !stream.resource<variable>
   // CHECK-DAG: = util.global.load @var_with_tensor_uninitialized__size : index
   %0 = util.global.load @var_with_tensor_uninitialized : tensor<4xf32>
   // CHECK-DAG: util.global.store %{{.+}}, @var_with_tensor_uninitialized : !stream.resource<variable>
   // CHECK-DAG: util.global.store %{{.+}}, @var_with_tensor_uninitialized__size : index
   util.global.store %0, @var_with_tensor_uninitialized : tensor<4xf32>
-  return
+  util.return
 }
 
 // -----
@@ -78,7 +78,7 @@
 // CHECK-DAG: util.global public mutable @var_with_buffer_view_store__size : index
 util.global public mutable @var_with_buffer_view_store : tensor<?x4xf32>
 // CHECK-LABEL: @globalStoreFromExternal
-func.func @globalStoreFromExternal(%arg0: !hal.buffer_view) {
+util.func public @globalStoreFromExternal(%arg0: !hal.buffer_view) {
   // CHECK: %[[DIM0:.+]] = hal.buffer_view.dim
   %dim0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
   // CHECK: %[[SIZE:.+]] = stream.tensor.sizeof tensor<?x4xf32>{%[[DIM0]]} : index
@@ -89,7 +89,7 @@
   // CHECK: util.global.store %[[VAR]], @var_with_buffer_view_store : !stream.resource<variable>
   // CHECK: util.global.store %[[SIZE]], @var_with_buffer_view_store__size : index
   util.global.store %0, @var_with_buffer_view_store : tensor<?x4xf32>
-  return
+  util.return
 }
 
 // -----
@@ -99,9 +99,9 @@
 
 // TODO(#7432): add indirect global expansion support to streams.
 // util.global public mutable @var_indirect_with_buffer_view_store : tensor<i32>
-// func.func @globalStoreFromExternalIndirect(%arg0: !hal.buffer_view) {
+// util.func public @globalStoreFromExternalIndirect(%arg0: !hal.buffer_view) {
 //   %0 = util.global.address @var_indirect_with_buffer_view_store : !util.ptr<tensor<i32>>
 //   %1 = hal.tensor.import %arg0 : !hal.buffer_view -> tensor<i32>
 //   util.global.store.indirect %1, %0 : tensor<i32> -> !util.ptr<tensor<i32>>
-//   return
+//   util.return
 // }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/UtilToStream/test/structural_ops.mlir b/compiler/src/iree/compiler/Dialect/Stream/Conversion/UtilToStream/test/structural_ops.mlir
new file mode 100644
index 0000000..c66e9a3
--- /dev/null
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/UtilToStream/test/structural_ops.mlir

@@ -0,0 +1,21 @@
+// RUN: iree-opt --split-input-file --iree-stream-conversion %s | FileCheck %s
+
+// CHECK-LABEL: @functionExpansion
+//  CHECK-SAME: (%[[ARG0:.+]]: !stream.resource<*>, %[[ARG0_SIZE:.+]]: index,
+//   CHECK-SAME: %[[ARG1:.+]]: i1,
+//  CHECK-SAME:  %[[ARG2:.+]]: !stream.resource<*>, %[[ARG2_SIZE:.+]]: index)
+//  CHECK-SAME: -> (!stream.resource<*>, index, i1, !stream.resource<*>, index)
+util.func private @functionExpansion(%arg0: tensor<4x?xf32>, %arg1: i1, %arg2: tensor<i32>)
+    -> (tensor<4x?xf32>, i1, tensor<i32>) {
+  // CHECK-NEXT: %[[RET:.+]]:5 = util.call @callee(%[[ARG0]], %[[ARG0_SIZE]], %[[ARG1]], %[[ARG2]], %[[ARG2_SIZE]])
+  // CHECK-SAME: : (!stream.resource<*>, index, i1, !stream.resource<*>, index) -> (!stream.resource<*>, index, i1, !stream.resource<*>, index)
+  %0:3 = util.call @callee(%arg0, %arg1, %arg2) : (tensor<4x?xf32>, i1, tensor<i32>) -> (tensor<4x?xf32>, i1, tensor<i32>)
+  // CHECK: util.return %[[RET]]#0, %[[RET]]#1, %[[RET]]#2,  %[[RET]]#3, %[[RET]]#4 : !stream.resource<*>, index, i1, !stream.resource<*>, index
+  util.return %0#0, %0#1, %0#2 : tensor<4x?xf32>, i1, tensor<i32>
+}
+
+// CHECK: util.func private @callee
+util.func private @callee(%arg0: tensor<4x?xf32>, %arg1: i1, %arg2: tensor<i32>)
+    -> (tensor<4x?xf32>, i1, tensor<i32>) {
+  util.return %arg0, %arg1, %arg2 : tensor<4x?xf32>, i1, tensor<i32>
+}

diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/test/async_folding.mlir b/compiler/src/iree/compiler/Dialect/Stream/IR/test/async_folding.mlir
index de8a0f1..1438588 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/test/async_folding.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/test/async_folding.mlir

@@ -4,7 +4,7 @@
 // We likely want to clone instead to reduce lifetime of the splats.
 
 // CHECK-LABEL: @SinkSplatsToConsumers
-func.func @SinkSplatsToConsumers(
+util.func private @SinkSplatsToConsumers(
   %arg0: i1, %arg1: i1,
   %arg2: !stream.resource<*>,
   %arg3: !stream.resource<*>,
@@ -37,13 +37,13 @@
   cf.br ^bb4(%3 : !stream.resource<*>)
 // CHECK: ^bb4(
 ^bb4(%arg6: !stream.resource<*>):
-  return %arg6 : !stream.resource<*>
+  util.return %arg6 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @SinkSplatsToCommonAncestorOfConsumersInRegions
-func.func @SinkSplatsToCommonAncestorOfConsumersInRegions(%arg0: i1) -> (!stream.resource<*>, !stream.resource<*>) {
+util.func public @SinkSplatsToCommonAncestorOfConsumersInRegions(%arg0: i1) -> (!stream.resource<*>, !stream.resource<*>) {
   // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
   %c0 = arith.constant 0 : index
   // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
@@ -81,13 +81,13 @@
     %6 = stream.async.dispatch @executable::@dispatch1[%c1, %c2, %c3](%0[%c0 to %c100 for %c100], %1[%c0 to %c100 for %c100]) : (!stream.resource<*>{%c100}, !stream.resource<*>{%c100}) -> !stream.resource<*>{%c100}
     scf.yield %6 : !stream.resource<*>
   }
-  return %4, %3 : !stream.resource<*>, !stream.resource<*>
+  util.return %4, %3 : !stream.resource<*>, !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @SplatAlreadyAtSinkLocation
-func.func @SplatAlreadyAtSinkLocation(
+util.func private @SplatAlreadyAtSinkLocation(
   %arg0: i1, %arg1: i1,
   %arg2: !stream.resource<*>,
   %arg3: !stream.resource<*>,
@@ -120,13 +120,13 @@
   cf.br ^bb3(%3 : !stream.resource<*>)
 // CHECK: ^bb3(
 ^bb3(%arg6: !stream.resource<*>):
-  return %arg6 : !stream.resource<*>
+  util.return %arg6 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @PropagateClonableOps
-func.func @PropagateClonableOps(%arg0: index) -> !stream.resource<*> {
+util.func private @PropagateClonableOps(%arg0: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   %c123_i32 = arith.constant 123 : i32
@@ -134,38 +134,38 @@
   %0 = stream.async.splat %c123_i32 : i32 -> !stream.resource<*>{%arg0}
   // CHECK-NOT: stream.async.clone
   %1 = stream.async.clone %0 : !stream.resource<*>{%arg0} -> !stream.resource<*>{%arg0}
-  // CHECK: return %[[T]]
-  return %1 : !stream.resource<*>
+  // CHECK: util.return %[[T]]
+  util.return %1 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @ConvertSplatConstantsIntoSplats
-func.func @ConvertSplatConstantsIntoSplats(%arg0: index) -> (!stream.resource<transient>, !stream.resource<transient>) {
+util.func private @ConvertSplatConstantsIntoSplats(%arg0: index) -> (!stream.resource<transient>, !stream.resource<transient>) {
   // CHECK: %[[CST:.+]] = arith.constant 3 : i32
   // CHECK: = stream.async.constant : !stream.resource<transient>{%arg0} = dense<[1, 2, 3, 4, 5, 6, 7, 8]> : tensor<8xi32>
   %0 = stream.async.constant : !stream.resource<transient>{%arg0} = dense<[1, 2, 3, 4, 5, 6, 7, 8]> : tensor<8xi32>
   // CHECK-NOT: = stream.async.constant : !stream.resource<transient>{%arg0} = dense<[3]> : tensor<8xi32>
   // CHECK: = stream.async.splat %[[CST]] : i32 -> !stream.resource<transient>{%arg0}
   %1 = stream.async.constant : !stream.resource<transient>{%arg0} = dense<3> : tensor<8xi32>
-  return %0, %1 : !stream.resource<transient>, !stream.resource<transient>
+  util.return %0, %1 : !stream.resource<transient>, !stream.resource<transient>
 }
 
 // -----
 
 // CHECK-LABEL: @FoldAsyncSliceOp
-func.func @FoldAsyncSliceOp(%arg0: !stream.resource<*>, %arg1: index) -> !stream.resource<*> {
+util.func private @FoldAsyncSliceOp(%arg0: !stream.resource<*>, %arg1: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   // CHECK-NOT: stream.async.slice
   %0 = stream.async.slice %arg0[%c0 to %arg1] : !stream.resource<*>{%arg1} -> !stream.resource<*>{%arg1}
-  // CHECK: return %arg0
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %arg0
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @PropagateSplatsThroughSlices
-func.func @PropagateSplatsThroughSlices(%arg0: index) -> !stream.resource<*> {
+util.func private @PropagateSplatsThroughSlices(%arg0: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   %c123_i32 = arith.constant 123 : i32
@@ -173,38 +173,38 @@
   %0 = stream.async.splat %c123_i32 : i32 -> !stream.resource<*>{%arg0}
   // CHECK-NOT: stream.async.slice
   %1 = stream.async.slice %0[%c0 to %c128] : !stream.resource<*>{%arg0} -> !stream.resource<*>{%c128}
-  // CHECK: return %[[T]]
-  return %1 : !stream.resource<*>
+  // CHECK: util.return %[[T]]
+  util.return %1 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @FlattenFullFillToSplat
-func.func @FlattenFullFillToSplat(%arg0: !stream.resource<*>, %arg1: index, %arg2: i32) -> !stream.resource<*> {
+util.func private @FlattenFullFillToSplat(%arg0: !stream.resource<*>, %arg1: index, %arg2: i32) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   // CHECK: %[[T:.+]] = stream.async.splat %arg2 : i32 -> !stream.resource<*>{%arg1}
   %0 = stream.async.fill %arg2, %arg0[%c0 to %arg1 for %arg1] : i32 -> %arg0 as !stream.resource<*>{%arg1}
-  // CHECK: return %[[T]]
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %[[T]]
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @ElideRedundantFill
-func.func @ElideRedundantFill(%arg0: !stream.resource<*>, %arg1: index, %arg2: i32) -> !stream.resource<*> {
+util.func private @ElideRedundantFill(%arg0: !stream.resource<*>, %arg1: index, %arg2: i32) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   // CHECK: %[[T:.+]] = stream.async.splat %arg2 : i32 -> !stream.resource<*>{%arg1}
   %0 = stream.async.splat %arg2 : i32 -> !stream.resource<*>{%arg1}
   // CHECK-NOT: stream.async.fill
   %1 = stream.async.fill %arg2, %0[%c0 to %arg1 for %arg1] : i32 -> %0 as !stream.resource<*>{%arg1}
-  // CHECK: return %[[T]]
-  return %1 : !stream.resource<*>
+  // CHECK: util.return %[[T]]
+  util.return %1 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @ElideRedundantFillBitPatterns
-func.func @ElideRedundantFillBitPatterns(%arg0: !stream.resource<*>, %arg1: index) -> !stream.resource<*> {
+util.func private @ElideRedundantFillBitPatterns(%arg0: !stream.resource<*>, %arg1: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   // CHECK: %[[CD_I8:.+]] = arith.constant -51 : i8
   %cCDCD_i16 = arith.constant 0xCDCD : i16
@@ -213,14 +213,14 @@
   %0 = stream.async.splat %cCDCDCDCD_i32 : i32 -> !stream.resource<*>{%arg1}
   // CHECK-NOT: stream.async.fill
   %1 = stream.async.fill %cCDCD_i16, %0[%c0 to %arg1 for %arg1] : i16 -> %0 as !stream.resource<*>{%arg1}
-  // CHECK: return %[[T]]
-  return %1 : !stream.resource<*>
+  // CHECK: util.return %[[T]]
+  util.return %1 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @CoalesceAdjacentFills
-func.func @CoalesceAdjacentFills(%arg0: !stream.resource<*>, %arg1: index) -> !stream.resource<*> {
+util.func private @CoalesceAdjacentFills(%arg0: !stream.resource<*>, %arg1: index) -> !stream.resource<*> {
   %c4 = arith.constant 4 : index
   %c8 = arith.constant 8 : index
   %c12 = arith.constant 12 : index
@@ -235,25 +235,25 @@
   %2 = stream.async.fill %c0_i8, %1[%c12 to %c16 for %c4] : i8 -> %1 as !stream.resource<*>{%arg1}
   // CHECK: %[[FILL_1:.+]] = stream.async.fill %c1_i8, %[[FILL_0]][%c16 to %c20 for %c4] : i8 -> %[[FILL_0]] as !stream.resource<*>{%arg1}
   %3 = stream.async.fill %c1_i8, %2[%c16 to %c20 for %c4] : i8 -> %2 as !stream.resource<*>{%arg1}
-  // CHECK: return %[[FILL_1]]
-  return %3 : !stream.resource<*>
+  // CHECK: util.return %[[FILL_1]]
+  util.return %3 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @FoldAsyncUpdateOp
-func.func @FoldAsyncUpdateOp(%arg0: !stream.resource<*>, %arg1: !stream.resource<*>, %arg2: index) -> !stream.resource<*> {
+util.func private @FoldAsyncUpdateOp(%arg0: !stream.resource<*>, %arg1: !stream.resource<*>, %arg2: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   // CHECK-NOT: stream.async.update
   %0 = stream.async.update %arg1, %arg0[%c0 to %arg2] : !stream.resource<*>{%arg2} -> %arg0 as !stream.resource<*>{%arg2}
-  // CHECK: return %arg1
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %arg1
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @CombineSplatUpdateFromToFill
-func.func @CombineSplatUpdateFromToFill(%arg0: !stream.resource<*>, %arg1: index) -> !stream.resource<*> {
+util.func private @CombineSplatUpdateFromToFill(%arg0: !stream.resource<*>, %arg1: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   %c123_i32 = arith.constant 123 : i32
@@ -261,28 +261,28 @@
   %0 = stream.async.splat %c123_i32 : i32 -> !stream.resource<*>{%c128}
   // CHECK: %[[T:.+]] = stream.async.fill %c123_i32, %arg0[%c0 to %c128 for %c128] : i32 -> %arg0 as !stream.resource<*>{%arg1}
   %1 = stream.async.update %0, %arg0[%c0 to %c128] : !stream.resource<*>{%c128} -> %arg0 as !stream.resource<*>{%arg1}
-  // CHECK: return %[[T]]
-  return %1 : !stream.resource<*>
+  // CHECK: util.return %[[T]]
+  util.return %1 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @CombineSliceUpdateFromToCopy
-func.func @CombineSliceUpdateFromToCopy(%arg0: !stream.resource<*>, %arg1: index, %arg2: !stream.resource<*>, %arg3: index) -> !stream.resource<*> {
+util.func private @CombineSliceUpdateFromToCopy(%arg0: !stream.resource<*>, %arg1: index, %arg2: !stream.resource<*>, %arg3: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   // CHECK-NOT: stream.async.slice
   %0 = stream.async.slice %arg0[%c0 to %c128] : !stream.resource<*>{%arg1} -> !stream.resource<*>{%c128}
   // CHECK: %[[T:.+]] = stream.async.copy %arg0[%c0 to %c128], %arg2[%c0 to %c128], %c128 : !stream.resource<*>{%arg1} -> %arg2 as !stream.resource<*>{%arg3}
   %1 = stream.async.update %0, %arg2[%c0 to %c128] : !stream.resource<*>{%c128} -> %arg2 as !stream.resource<*>{%arg3}
-  // CHECK: return %[[T]]
-  return %1 : !stream.resource<*>
+  // CHECK: util.return %[[T]]
+  util.return %1 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @AsyncCopyFullSourceToUpdate
-func.func @AsyncCopyFullSourceToUpdate(%arg0: !stream.resource<*>, %arg1: index, %arg2: !stream.resource<*>, %arg3: index) -> (!stream.resource<*>, !stream.resource<*>) {
+util.func private @AsyncCopyFullSourceToUpdate(%arg0: !stream.resource<*>, %arg1: index, %arg2: !stream.resource<*>, %arg3: index) -> (!stream.resource<*>, !stream.resource<*>) {
   %c0 = arith.constant 0 : index
   %c8 = arith.constant 8 : index
   %c16 = arith.constant 16 : index
@@ -295,69 +295,69 @@
   // CHECK: = stream.async.copy %arg2[%c16 to %arg3], %arg0[%c0 to %arg3], %c8 : !stream.resource<*>{%arg3} -> %arg0 as !stream.resource<*>{%arg1}
   %1 = stream.async.copy %arg2[%c16 to %arg3], %arg0[%c0 to %arg3], %c8 : !stream.resource<*>{%arg3} -> %arg0 as !stream.resource<*>{%arg1}
 
-  return %0, %1 : !stream.resource<*>, !stream.resource<*>
+  util.return %0, %1 : !stream.resource<*>, !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @FoldAsyncTransferOp
-func.func @FoldAsyncTransferOp(%arg0: !stream.resource<transient>, %arg1: index) -> !stream.resource<transient> {
+util.func private @FoldAsyncTransferOp(%arg0: !stream.resource<transient>, %arg1: index) -> !stream.resource<transient> {
   // CHECK-NOT: stream.async.transfer
   %0 = stream.async.transfer %arg0 : !stream.resource<transient>{%arg1} -> !stream.resource<staging>{%arg1}
   %1 = stream.async.transfer %0 : !stream.resource<staging>{%arg1} -> !stream.resource<transient>{%arg1}
-  return %1 : !stream.resource<transient>
+  util.return %1 : !stream.resource<transient>
 }
 
 // -----
 
 // CHECK-LABEL: @RedundantTransferElision
-func.func @RedundantTransferElision(%arg0: !stream.resource<transient>, %arg1: index) -> !stream.resource<transient> {
+util.func private @RedundantTransferElision(%arg0: !stream.resource<transient>, %arg1: index) -> !stream.resource<transient> {
   // CHECK-NOT: stream.async.transfer
   %0 = stream.async.transfer %arg0 : !stream.resource<transient>{%arg1} -> !stream.resource<transient>{%arg1}
-  return %0 : !stream.resource<transient>
+  util.return %0 : !stream.resource<transient>
 }
 
 // -----
 
 // CHECK-LABEL: @IntermediateTransferElision
 // CHECK-SAME: (%[[SOURCE:.+]]: !stream.resource<constant>, %[[SIZE:.+]]: index)
-func.func @IntermediateTransferElision(%source: !stream.resource<constant>, %size: index) -> !stream.resource<external> {
+util.func private @IntermediateTransferElision(%source: !stream.resource<constant>, %size: index) -> !stream.resource<external> {
   // CHECK: %[[TRANSFER:.+]] = stream.async.transfer %[[SOURCE]] : !stream.resource<constant>{%[[SIZE]]} -> !stream.resource<external>{%[[SIZE]]}
   %transfer0 = stream.async.transfer %source : !stream.resource<constant>{%size} -> !stream.resource<staging>{%size}
   // CHECK-NOT: stream.async.transfer
   %transfer1 = stream.async.transfer %transfer0 : !stream.resource<staging>{%size} -> !stream.resource<external>{%size}
-  // CHECK-NEXT: return %[[TRANSFER]]
-  return %transfer1 : !stream.resource<external>
+  // CHECK-NEXT: util.return %[[TRANSFER]]
+  util.return %transfer1 : !stream.resource<external>
 }
 
 // -----
 
 // CHECK-LABEL: @FoldAsyncLoadBitcast
-func.func @FoldAsyncLoadBitcast(%arg0: !stream.resource<staging>, %arg1: index) -> f32 {
+util.func private @FoldAsyncLoadBitcast(%arg0: !stream.resource<staging>, %arg1: index) -> f32 {
   %c0 = arith.constant 0 : index
   // CHECK: %[[F32:.+]] = stream.async.load %arg0[%c0] : !stream.resource<staging>{%arg1} -> f32
   %0 = stream.async.load %arg0[%c0] : !stream.resource<staging>{%arg1} -> i32
   // CHECK-NOT: arith.bitcast
   %1 = arith.bitcast %0 : i32 to f32
-  // CHECK: return %[[F32]]
-  return %1 : f32
+  // CHECK: util.return %[[F32]]
+  util.return %1 : f32
 }
 
 // -----
 
 // CHECK-LABEL: @FoldAsyncStoreBitcast
-func.func @FoldAsyncStoreBitcast(%arg0: !stream.resource<staging>, %arg1: index, %arg2: f32) -> !stream.resource<staging> {
+util.func private @FoldAsyncStoreBitcast(%arg0: !stream.resource<staging>, %arg1: index, %arg2: f32) -> !stream.resource<staging> {
   %c0 = arith.constant 0 : index
   %0 = arith.bitcast %arg2 : f32 to i32
   // CHECK: = stream.async.store %arg2, %arg0[%c0] : f32 -> %arg0 as !stream.resource<staging>{%arg1}
   %1 = stream.async.store %0, %arg0[%c0] : i32 -> %arg0 as !stream.resource<staging>{%arg1}
-  return %1 : !stream.resource<staging>
+  util.return %1 : !stream.resource<staging>
 }
 
 // -----
 
 // CHECK-LABEL: @ElideImmediateAsyncExecuteWaits
-func.func @ElideImmediateAsyncExecuteWaits(%arg0: !stream.resource<*>, %arg1: index) -> (!stream.resource<*>, !stream.timepoint) {
+util.func private @ElideImmediateAsyncExecuteWaits(%arg0: !stream.resource<*>, %arg1: index) -> (!stream.resource<*>, !stream.timepoint) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   // CHECK-NOT: stream.timepoint.immediate
@@ -369,13 +369,13 @@
     // CHECK: stream.yield
     stream.yield %1 : !stream.resource<*>{%arg1}
   } => !stream.timepoint
-  return %0#0, %0#1 : !stream.resource<*>, !stream.timepoint
+  util.return %0#0, %0#1 : !stream.resource<*>, !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @ChainAsyncExecuteWaits
-func.func @ChainAsyncExecuteWaits(%arg0: !stream.resource<*>, %arg1: index, %arg2: !stream.timepoint) -> (!stream.resource<*>, !stream.timepoint) {
+util.func private @ChainAsyncExecuteWaits(%arg0: !stream.resource<*>, %arg1: index, %arg2: !stream.timepoint) -> (!stream.resource<*>, !stream.timepoint) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   // CHECK-NOT: stream.timepoint.await
@@ -387,13 +387,13 @@
     // CHECK: stream.yield
     stream.yield %1 : !stream.resource<*>{%arg1}
   } => !stream.timepoint
-  return %1#0, %1#1 : !stream.resource<*>, !stream.timepoint
+  util.return %1#0, %1#1 : !stream.resource<*>, !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @CloneCapturedAsyncExecuteSubviewOps
-func.func @CloneCapturedAsyncExecuteSubviewOps(%arg0: !stream.resource<*>, %arg1: index) -> (!stream.resource<*>, !stream.timepoint) {
+util.func private @CloneCapturedAsyncExecuteSubviewOps(%arg0: !stream.resource<*>, %arg1: index) -> (!stream.resource<*>, !stream.timepoint) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c128 = arith.constant 128 : index
@@ -407,26 +407,26 @@
     // CHECK: stream.yield
     stream.yield %1 : !stream.resource<*>{%c128}
   } => !stream.timepoint
-  return %1#0, %1#1 : !stream.resource<*>, !stream.timepoint
+  util.return %1#0, %1#1 : !stream.resource<*>, !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @ElideNoOpAsyncExecuteOp
-func.func @ElideNoOpAsyncExecuteOp(%arg0: !stream.resource<*>, %arg1: index, %arg2: !stream.timepoint) -> (!stream.resource<*>, !stream.timepoint) {
+util.func private @ElideNoOpAsyncExecuteOp(%arg0: !stream.resource<*>, %arg1: index, %arg2: !stream.timepoint) -> (!stream.resource<*>, !stream.timepoint) {
   // CHECK-NOT: stream.async.execute
   %1:2 = stream.async.execute await(%arg2) => with(%arg0 as %arg3: !stream.resource<*>{%arg1}) -> %arg0{%arg1} {
     stream.yield %arg3 : !stream.resource<*>{%arg1}
   } => !stream.timepoint
   // CHECK: %[[IMM:.+]] = stream.timepoint.immediate
-  // CHECK: return %arg0, %[[IMM]]
-  return %1#0, %1#1 : !stream.resource<*>, !stream.timepoint
+  // CHECK: util.return %arg0, %[[IMM]]
+  util.return %1#0, %1#1 : !stream.resource<*>, !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @TieRegionResultsAsyncExecuteOp
-func.func @TieRegionResultsAsyncExecuteOp(%arg0: !stream.resource<*>, %arg1: index) -> (!stream.resource<*>, !stream.timepoint) {
+util.func private @TieRegionResultsAsyncExecuteOp(%arg0: !stream.resource<*>, %arg1: index) -> (!stream.resource<*>, !stream.timepoint) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   // CHECK: = stream.async.execute with(%arg0 as %arg2: !stream.resource<*>{%arg1}) -> %arg0{%arg1}
@@ -436,13 +436,13 @@
     // CHECK: stream.yield %[[T]]
     stream.yield %1 : !stream.resource<*>{%arg1}
   } => !stream.timepoint
-  return %0#0, %0#1 : !stream.resource<*>, !stream.timepoint
+  util.return %0#0, %0#1 : !stream.resource<*>, !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @ElideUnusedAsyncExecuteOp
-func.func @ElideUnusedAsyncExecuteOp(%arg0: !stream.resource<*>, %arg1: index, %arg2: !stream.timepoint) {
+util.func private @ElideUnusedAsyncExecuteOp(%arg0: !stream.resource<*>, %arg1: index, %arg2: !stream.timepoint) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   // CHECK-NOT: stream.async.execute
@@ -450,13 +450,13 @@
     %1 = stream.async.dispatch @executable::@dispatch0[%c1, %c1, %c1](%arg3[%c0 to %arg1 for %arg1]) : (!stream.resource<*>{%arg1}) -> !stream.resource<*>{%arg1}
     stream.yield %1 : !stream.resource<*>{%arg1}
   } => !stream.timepoint
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @TieRegionResultsAsyncConcurrentOp
-func.func @TieRegionResultsAsyncConcurrentOp(%arg0: !stream.resource<*>, %arg1: index) -> (!stream.resource<*>, !stream.timepoint) {
+util.func private @TieRegionResultsAsyncConcurrentOp(%arg0: !stream.resource<*>, %arg1: index) -> (!stream.resource<*>, !stream.timepoint) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   // CHECK: = stream.async.execute with(%arg0 as %arg2: !stream.resource<*>{%arg1}) -> %arg0{%arg1}
@@ -471,13 +471,13 @@
     // CHECK: stream.yield %[[EXEC_T]]
     stream.yield %1 : !stream.resource<*>{%arg1}
   } => !stream.timepoint
-  return %0#0, %0#1 : !stream.resource<*>, !stream.timepoint
+  util.return %0#0, %0#1 : !stream.resource<*>, !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @ElideUnusedAsyncConcurrentOp
-func.func @ElideUnusedAsyncConcurrentOp(%arg0: !stream.resource<*>, %arg1: index, %arg2: !stream.timepoint) -> (!stream.resource<*>, !stream.timepoint) {
+util.func private @ElideUnusedAsyncConcurrentOp(%arg0: !stream.resource<*>, %arg1: index, %arg2: !stream.timepoint) -> (!stream.resource<*>, !stream.timepoint) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   // CHECK: stream.async.execute
@@ -492,5 +492,5 @@
     }
     stream.yield %1 : !stream.resource<*>{%arg1}
   } => !stream.timepoint
-  return %0#0, %0#1 : !stream.resource<*>, !stream.timepoint
+  util.return %0#0, %0#1 : !stream.resource<*>, !stream.timepoint
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/test/async_ops.mlir b/compiler/src/iree/compiler/Dialect/Stream/IR/test/async_ops.mlir
index f136ea8..0c6085a 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/test/async_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/test/async_ops.mlir

@@ -1,81 +1,81 @@
 // RUN: iree-opt --split-input-file %s --verify-diagnostics | FileCheck %s
 
 // CHECK-LABEL: @asyncAlloca
-func.func @asyncAlloca(%arg0: index) -> !stream.resource<transient> {
+util.func private @asyncAlloca(%arg0: index) -> !stream.resource<transient> {
   // CHECK: = stream.async.alloca : !stream.resource<transient>{%arg0}
   %0 = stream.async.alloca : !stream.resource<transient>{%arg0}
-  return %0 : !stream.resource<transient>
+  util.return %0 : !stream.resource<transient>
 }
 
 // -----
 
 // CHECK-LABEL: @asyncConstant
-func.func @asyncConstant(%arg0: index) -> !stream.resource<transient> {
+util.func private @asyncConstant(%arg0: index) -> !stream.resource<transient> {
   // CHECK: = stream.async.constant : !stream.resource<transient>{%arg0} = dense<3> : tensor<8xi32>
   %0 = stream.async.constant : !stream.resource<transient>{%arg0} = dense<3> : tensor<8xi32>
-  return %0 : !stream.resource<transient>
+  util.return %0 : !stream.resource<transient>
 }
 
 // -----
 
 // CHECK-LABEL: @asyncSplat
-func.func @asyncSplat(%arg0: index, %arg1: i32) -> !stream.resource<*> {
+util.func private @asyncSplat(%arg0: index, %arg1: i32) -> !stream.resource<*> {
   // CHECK: = stream.async.splat %arg1 : i32 -> !stream.resource<*>{%arg0}
   %0 = stream.async.splat %arg1 : i32 -> !stream.resource<*>{%arg0}
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @asyncClone
-func.func @asyncClone(%arg0: !stream.resource<*>, %arg1: index) -> !stream.resource<*> {
+util.func private @asyncClone(%arg0: !stream.resource<*>, %arg1: index) -> !stream.resource<*> {
   // CHECK: = stream.async.clone %arg0 : !stream.resource<*>{%arg1} -> !stream.resource<*>{%arg1}
   %0 = stream.async.clone %arg0 : !stream.resource<*>{%arg1} -> !stream.resource<*>{%arg1}
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @asyncSlice
-func.func @asyncSlice(%arg0: !stream.resource<*>, %arg1: index) -> !stream.resource<*> {
+util.func private @asyncSlice(%arg0: !stream.resource<*>, %arg1: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   // CHECK: = stream.async.slice %arg0[%c0 to %c128] : !stream.resource<*>{%arg1} -> !stream.resource<*>{%c128}
   %0 = stream.async.slice %arg0[%c0 to %c128] : !stream.resource<*>{%arg1} -> !stream.resource<*>{%c128}
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @asyncFill
-func.func @asyncFill(%arg0: !stream.resource<*>, %arg1: index, %arg2: i32) -> !stream.resource<*> {
+util.func private @asyncFill(%arg0: !stream.resource<*>, %arg1: index, %arg2: i32) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   // CHECK: = stream.async.fill %arg2, %arg0[%c0 to %c128 for %c128] : i32 -> %arg0 as !stream.resource<*>{%arg1}
   %0 = stream.async.fill %arg2, %arg0[%c0 to %c128 for %c128] : i32 -> %arg0 as !stream.resource<*>{%arg1}
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @asyncUpdate
-func.func @asyncUpdate(%arg0: !stream.resource<*>, %arg1: index, %arg2: !stream.resource<*>, %arg3: index) -> !stream.resource<*> {
+util.func private @asyncUpdate(%arg0: !stream.resource<*>, %arg1: index, %arg2: !stream.resource<*>, %arg3: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   // CHECK: = stream.async.update %arg2, %arg0[%c0 to %c128] : !stream.resource<*>{%arg3} -> %arg0 as !stream.resource<*>{%arg1}
   %0 = stream.async.update %arg2, %arg0[%c0 to %c128] : !stream.resource<*>{%arg3} -> %arg0 as !stream.resource<*>{%arg1}
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @asyncCopy
-func.func @asyncCopy(%arg0: !stream.resource<*>, %arg1: index, %arg2: !stream.resource<*>, %arg3: index) -> !stream.resource<*> {
+util.func private @asyncCopy(%arg0: !stream.resource<*>, %arg1: index, %arg2: !stream.resource<*>, %arg3: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   // CHECK: = stream.async.copy %arg2[%c0 to %c128], %arg0[%c0 to %c128], %c128 : !stream.resource<*>{%arg3} -> %arg0 as !stream.resource<*>{%arg1}
   %0 = stream.async.copy %arg2[%c0 to %c128], %arg0[%c0 to %c128], %c128 : !stream.resource<*>{%arg3} -> %arg0 as !stream.resource<*>{%arg1}
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
@@ -83,7 +83,7 @@
 // This covers all_gather, all_reduce, and reduce_scatter variants.
 
 // CHECK-LABEL: @asyncCollectiveAllGather
-func.func @asyncCollectiveAllGather(
+util.func private @asyncCollectiveAllGather(
     // CHECK-SAME: %[[CHANNEL:.+]]: !stream.channel,
     %channel: !stream.channel,
     // CHECK-SAME: %[[SEND:[a-z0-9]+]]: !stream.resource<*>, %[[SEND_SIZE:[a-z0-9]+]]: index,
@@ -103,7 +103,7 @@
       %recv[%c0 to %recv_size for %recv_size] :
       // CHECK-SAME: !stream.resource<*>{%[[SEND_SIZE]]} -> %[[RECV]] as !stream.resource<*>{%[[RECV_SIZE]]}
       !stream.resource<*>{%send_size} -> %recv as !stream.resource<*>{%recv_size}
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
@@ -111,7 +111,7 @@
 // This covers broadcast and reduce variants.
 
 // CHECK-LABEL: @asyncCollectiveBroadcast
-func.func @asyncCollectiveBroadcast(
+util.func private @asyncCollectiveBroadcast(
     // CHECK-SAME: %[[CHANNEL:.+]]: !stream.channel,
     %channel: !stream.channel,
     // CHECK-SAME: %[[RANK:[a-z0-9]+]]: i32,
@@ -133,42 +133,42 @@
       %recv[%c0 to %recv_size for %recv_size] :
       // CHECK-SAME: !stream.resource<*>{%[[SEND_SIZE]]} -> %[[RECV]] as !stream.resource<*>{%[[RECV_SIZE]]}
       !stream.resource<*>{%send_size} -> %recv as !stream.resource<*>{%recv_size}
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @asyncTransfer
-func.func @asyncTransfer(%arg0: !stream.resource<constant>, %arg1: index) -> !stream.resource<staging> {
+util.func private @asyncTransfer(%arg0: !stream.resource<constant>, %arg1: index) -> !stream.resource<staging> {
   // CHECK: = stream.async.transfer %arg0 : !stream.resource<constant>{%arg1} -> !stream.resource<staging>{%arg1}
   %0 = stream.async.transfer %arg0 : !stream.resource<constant>{%arg1} -> !stream.resource<staging>{%arg1}
-  return %0 : !stream.resource<staging>
+  util.return %0 : !stream.resource<staging>
 }
 
 // -----
 
 // CHECK-LABEL: @asyncLoad
-func.func @asyncLoad(%arg0: !stream.resource<staging>, %arg1: index) -> f32 {
+util.func private @asyncLoad(%arg0: !stream.resource<staging>, %arg1: index) -> f32 {
   %c0 = arith.constant 0 : index
   // CHECK: = stream.async.load %arg0[%c0] : !stream.resource<staging>{%arg1} -> f32
   %0 = stream.async.load %arg0[%c0] : !stream.resource<staging>{%arg1} -> f32
-  return %0 : f32
+  util.return %0 : f32
 }
 
 // -----
 
 // CHECK-LABEL: @asyncStore
-func.func @asyncStore(%arg0: !stream.resource<staging>, %arg1: index, %arg2: f32) -> !stream.resource<staging> {
+util.func private @asyncStore(%arg0: !stream.resource<staging>, %arg1: index, %arg2: f32) -> !stream.resource<staging> {
   %c0 = arith.constant 0 : index
   // CHECK: = stream.async.store %arg2, %arg0[%c0] : f32 -> %arg0 as !stream.resource<staging>{%arg1}
   %0 = stream.async.store %arg2, %arg0[%c0] : f32 -> %arg0 as !stream.resource<staging>{%arg1}
-  return %0 : !stream.resource<staging>
+  util.return %0 : !stream.resource<staging>
 }
 
 // -----
 
 // CHECK-LABEL: @asyncDispatch
-func.func @asyncDispatch(%arg0: !stream.resource<*>, %arg1: index) -> !stream.resource<*> {
+util.func private @asyncDispatch(%arg0: !stream.resource<*>, %arg1: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
@@ -176,17 +176,17 @@
   %c4 = arith.constant 4 : index
   // CHECK: = stream.async.dispatch @executable::@dispatch[%c1, %c2, %c3](%arg0[%c0 to %arg1 for %arg1], %c4) : (!stream.resource<*>{%arg1}, index) -> %arg0{%arg1}
   %0 = stream.async.dispatch @executable::@dispatch[%c1, %c2, %c3](%arg0[%c0 to %arg1 for %arg1], %c4) : (!stream.resource<*>{%arg1}, index) -> %arg0{%arg1}
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @asyncDispatchNoInputs
-func.func @asyncDispatchNoInputs(%arg0: index) -> !stream.resource<*> {
+util.func private @asyncDispatchNoInputs(%arg0: index) -> !stream.resource<*> {
   %c1 = arith.constant 1 : index
   // CHECK: = stream.async.dispatch @executable::@dispatch[%c1]() : () -> !stream.resource<*>{%arg0}
   %0 = stream.async.dispatch @executable::@dispatch[%c1]() : () -> !stream.resource<*>{%arg0}
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
@@ -196,21 +196,21 @@
     stream.return %arg0, %arg1, %arg0 : index, index, index
   }
   builtin.module {
-    func.func @dispatch() {
-      return
+    util.func private @dispatch() {
+      util.return
     }
   }
 }
 
 // CHECK-LABEL: @asyncDispatchWithWorkgroupCount
-func.func @asyncDispatchWithWorkgroupCount(%arg0: !stream.resource<*>, %arg1: index) -> !stream.resource<*> {
+util.func private @asyncDispatchWithWorkgroupCount(%arg0: !stream.resource<*>, %arg1: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
   %c4 = arith.constant 4 : index
   // CHECK: = stream.async.dispatch @executable::@dispatch[%c1, %c2](%arg0[%c0 to %arg1 for %arg1], %c4) : (!stream.resource<*>{%arg1}, index) -> %arg0{%arg1}
   %0 = stream.async.dispatch @executable::@dispatch[%c1, %c2](%arg0[%c0 to %arg1 for %arg1], %c4) : (!stream.resource<*>{%arg1}, index) -> %arg0{%arg1}
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
@@ -220,31 +220,31 @@
     stream.return %arg0, %arg0, %arg0 : index, index, index
   }
   builtin.module {
-    func.func @dispatch() {
-      return
+    util.func private @dispatch() {
+      util.return
     }
   }
 }
 
-func.func @asyncDispatchWithInvalidWorkload(%arg0: !stream.resource<*>, %arg1: index) -> !stream.resource<*> {
+util.func private @asyncDispatchWithInvalidWorkload(%arg0: !stream.resource<*>, %arg1: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
   %c4 = arith.constant 4 : index
   // expected-error @+1 {{op workload mismatch; entry point expects 1 arguments but dispatch provides 2}}
   %0 = stream.async.dispatch @executable::@dispatch[%c1, %c2](%arg0[%c0 to %arg1 for %arg1], %c4) : (!stream.resource<*>{%arg1}, index) -> %arg0{%arg1}
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @asyncDispatchNoWorkload
-func.func @asyncDispatchNoWorkload(%arg0: !stream.resource<*>, %arg1: index) -> !stream.resource<*> {
+util.func private @asyncDispatchNoWorkload(%arg0: !stream.resource<*>, %arg1: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   %c4 = arith.constant 4 : index
   // CHECK: = stream.async.dispatch @executable::@dispatch(%arg0[%c0 to %arg1 for %arg1], %c4) : (!stream.resource<*>{%arg1}, index) -> %arg0{%arg1}
   %0 = stream.async.dispatch @executable::@dispatch(%arg0[%c0 to %arg1 for %arg1], %c4) : (!stream.resource<*>{%arg1}, index) -> %arg0{%arg1}
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
@@ -253,17 +253,17 @@
 
 // CHECK-LABEL: @asyncCall
 // CHECK-SAME: (%[[ARG0:.+]]: !stream.resource<*>, %[[SIZE0:.+]]: index)
-func.func @asyncCall(%arg0: !stream.resource<*>, %arg1: index) -> !stream.resource<*> {
+util.func private @asyncCall(%arg0: !stream.resource<*>, %arg1: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   // CHECK: = stream.async.call @asyncExtern(%[[ARG0]][%c0 to %[[SIZE0]] for %[[SIZE0]]], %[[SIZE0]]) : (!stream.resource<*>{%[[SIZE0]]}, index) -> %[[ARG0]]{%[[SIZE0]]}
   %call = stream.async.call @asyncExtern(%arg0[%c0 to %arg1 for %arg1], %arg1) : (!stream.resource<*>{%arg1}, index) -> %arg0{%arg1}
-  return %call : !stream.resource<*>
+  util.return %call : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @asyncExecute
-func.func @asyncExecute(%arg0: !stream.resource<*>, %arg1: index, %arg2: !stream.timepoint) -> (!stream.resource<*>, !stream.timepoint) {
+util.func private @asyncExecute(%arg0: !stream.resource<*>, %arg1: index, %arg2: !stream.timepoint) -> (!stream.resource<*>, !stream.timepoint) {
   // CHECK: = stream.async.execute await(%arg2) => with(%arg0 as %arg3: !stream.resource<*>{%arg1}) -> %arg0{%arg1} {
   %0:2 = stream.async.execute await(%arg2) => with(%arg0 as %arg3: !stream.resource<*>{%arg1}) -> %arg0 as !stream.resource<*>{%arg1} {
     // CHECK: %[[W:.+]] = stream.async.concurrent with(%arg3 as %arg4: !stream.resource<*>{%arg1}) -> %arg3{%arg1} {
@@ -274,13 +274,13 @@
     // CHECK: stream.yield %[[W]] : !stream.resource<*>{%arg1}
     stream.yield %1 : !stream.resource<*>{%arg1}
   } => !stream.timepoint
-  return %0#0, %0#1 : !stream.resource<*>, !stream.timepoint
+  util.return %0#0, %0#1 : !stream.resource<*>, !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @asyncExecuteNoCaptures
-func.func @asyncExecuteNoCaptures(%arg0: index, %arg1: i32) -> (!stream.resource<*>, !stream.timepoint) {
+util.func private @asyncExecuteNoCaptures(%arg0: index, %arg1: i32) -> (!stream.resource<*>, !stream.timepoint) {
   // CHECK: = stream.async.execute with() -> !stream.resource<*>{%arg0} {
   %0:2 = stream.async.execute with() -> !stream.resource<*>{%arg0} {
     // CHECK: %[[T:.+]] = stream.async.splat
@@ -288,17 +288,17 @@
     // CHECK: stream.yield %[[T]] : !stream.resource<*>{%arg0}
     stream.yield %1 : !stream.resource<*>{%arg0}
   } => !stream.timepoint
-  return %0#0, %0#1 : !stream.resource<*>, !stream.timepoint
+  util.return %0#0, %0#1 : !stream.resource<*>, !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @asyncExecuteNoResults
-func.func @asyncExecuteNoResults(%arg0: !stream.resource<*>, %arg1: index, %arg2: !stream.timepoint) -> (!stream.timepoint) {
+util.func private @asyncExecuteNoResults(%arg0: !stream.resource<*>, %arg1: index, %arg2: !stream.timepoint) -> (!stream.timepoint) {
   // CHECK: = stream.async.execute await(%arg2) => with(%arg0 as %arg3: !stream.resource<*>{%arg1}) {
   %0:1 = stream.async.execute await(%arg2) => with(%arg0 as %arg3: !stream.resource<*>{%arg1}) {
     // CHECK: stream.yield
     stream.yield
   } => !stream.timepoint
-  return %0#0 : !stream.timepoint
+  util.return %0#0 : !stream.timepoint
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/test/channel_folding.mlir b/compiler/src/iree/compiler/Dialect/Stream/IR/test/channel_folding.mlir
index 282f5a7..1acb552 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/test/channel_folding.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/test/channel_folding.mlir

@@ -2,42 +2,42 @@
 
 // CHECK-LABEL: @FoldChannelRankOp
 //  CHECK-SAME: (%[[RANK:.+]]: index)
-func.func @FoldChannelRankOp(%rank: index) -> index {
+util.func private @FoldChannelRankOp(%rank: index) -> index {
   %channel = stream.channel.create rank(%rank) : !stream.channel
   %queried_rank = stream.channel.rank %channel : index
-  // CHECK: return %[[RANK]]
-  return %queried_rank : index
+  // CHECK: util.return %[[RANK]]
+  util.return %queried_rank : index
 }
 
 // -----
 
 // CHECK-LABEL: @NoFoldChannelRankOp
-func.func @NoFoldChannelRankOp() -> index {
+util.func private @NoFoldChannelRankOp() -> index {
   %channel = stream.channel.create : !stream.channel
   // CHECK: %[[RANK:.+]] = stream.channel.rank
   %queried_rank = stream.channel.rank %channel : index
-  // CHECK: return %[[RANK]]
-  return %queried_rank : index
+  // CHECK: util.return %[[RANK]]
+  util.return %queried_rank : index
 }
 
 // -----
 
 // CHECK-LABEL: @FoldChannelCountOp
 //  CHECK-SAME: (%[[COUNT:.+]]: index)
-func.func @FoldChannelCountOp(%count: index) -> index {
+util.func private @FoldChannelCountOp(%count: index) -> index {
   %channel = stream.channel.create count(%count) : !stream.channel
   %queried_count = stream.channel.count %channel : index
-  // CHECK: return %[[COUNT]]
-  return %queried_count : index
+  // CHECK: util.return %[[COUNT]]
+  util.return %queried_count : index
 }
 
 // -----
 
 // CHECK-LABEL: @NoFoldChannelCountOp
-func.func @NoFoldChannelCountOp() -> index {
+util.func private @NoFoldChannelCountOp() -> index {
   %channel = stream.channel.create : !stream.channel
   // CHECK: %[[COUNT:.+]] = stream.channel.count
   %queried_count = stream.channel.count %channel : index
-  // CHECK: return %[[COUNT]]
-  return %queried_count : index
+  // CHECK: util.return %[[COUNT]]
+  util.return %queried_count : index
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/test/channel_ops.mlir b/compiler/src/iree/compiler/Dialect/Stream/IR/test/channel_ops.mlir
index 7c4d7c5..486a03f 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/test/channel_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/test/channel_ops.mlir

@@ -2,42 +2,42 @@
 
 // CHECK-LABEL: @channel_create
 //  CHECK-SAME: (%[[RANK:.+]]: index, %[[COUNT:.+]]: index)
-func.func @channel_create(%rank: index, %count: index) {
+util.func private @channel_create(%rank: index, %count: index) {
   // CHECK: %channel = stream.channel.create on(#hal.affinity.queue<[0, 1]>) rank(%[[RANK]]) count(%[[COUNT]]) : !stream.channel
   %channel = stream.channel.create on(#hal.affinity.queue<[0, 1]>) rank(%rank) count(%count) : !stream.channel
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @channel_split
 //  CHECK-SAME: (%[[BASE_CHANNEL:.+]]: !stream.channel)
-func.func @channel_split(%base_channel: !stream.channel) {
+util.func private @channel_split(%base_channel: !stream.channel) {
   // CHECK-DAG: %[[COLOR:.+]] = arith.constant 100 : index
   %color = arith.constant 100 : index
   // CHECK-DAG: %[[KEY:.+]] = arith.constant 101 : index
   %key = arith.constant 101 : index
   // CHECK: %channel = stream.channel.split %[[BASE_CHANNEL]], %[[COLOR]], %[[KEY]] : !stream.channel -> !stream.channel
   %split_channel = stream.channel.split %base_channel, %color, %key : !stream.channel -> !stream.channel
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @channel_rank
 //  CHECK-SAME: (%[[CHANNEL:.+]]: !stream.channel)
-func.func @channel_rank(%channel: !stream.channel) -> index {
+util.func private @channel_rank(%channel: !stream.channel) -> index {
   // CHECK: = stream.channel.rank %[[CHANNEL]] : index
   %rank = stream.channel.rank %channel : index
-  return %rank : index
+  util.return %rank : index
 }
 
 // -----
 
 // CHECK-LABEL: @channel_count
 //  CHECK-SAME: (%[[CHANNEL:.+]]: !stream.channel)
-func.func @channel_count(%channel: !stream.channel) -> index {
+util.func private @channel_count(%channel: !stream.channel) -> index {
   // CHECK: = stream.channel.count %[[CHANNEL]] : index
   %count = stream.channel.count %channel : index
-  return %count : index
+  util.return %count : index
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/test/cmd_folding.mlir b/compiler/src/iree/compiler/Dialect/Stream/IR/test/cmd_folding.mlir
index 0d76525..5f8b27f 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/test/cmd_folding.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/test/cmd_folding.mlir

@@ -1,7 +1,7 @@
 // RUN: iree-opt --split-input-file --canonicalize %s | iree-opt --split-input-file | FileCheck %s
 
 // CHECK-LABEL: @FoldSubviewsIntoCmdTOp
-func.func @FoldSubviewsIntoCmdTOp(%arg0: !stream.resource<transient>, %arg1: index) -> !stream.timepoint {
+util.func private @FoldSubviewsIntoCmdTOp(%arg0: !stream.resource<transient>, %arg1: index) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %c64 = arith.constant 64 : index
   %c1000 = arith.constant 1000 : index
@@ -19,13 +19,13 @@
     // CHECK: stream.cmd.fill %c255_i32, %arg2[%c1064 for %c2000] : i32 -> !stream.resource<transient>{%arg1}
     stream.cmd.fill %c255_i32, %arg2[%c1000 for %c2000] : i32 -> !stream.resource<transient>{%c3000}
   } => !stream.timepoint
-  return %1 : !stream.timepoint
+  util.return %1 : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @FoldSubviewsIntoCmdCopyOp
-func.func @FoldSubviewsIntoCmdCopyOp(%arg0: !stream.resource<transient>, %arg1: index) -> !stream.timepoint {
+util.func private @FoldSubviewsIntoCmdCopyOp(%arg0: !stream.resource<transient>, %arg1: index) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %c64 = arith.constant 64 : index
   %c128 = arith.constant 128 : index
@@ -39,13 +39,13 @@
     // CHECK: stream.cmd.copy %arg2[%c1064], %arg2[%c2128], %c1000 : !stream.resource<transient>{%arg1} -> !stream.resource<transient>{%arg1}
     stream.cmd.copy %arg2[%c1000], %arg3[%c2000], %c1000 : !stream.resource<transient>{%c3000} -> !stream.resource<transient>{%c4000}
   } => !stream.timepoint
-  return %2 : !stream.timepoint
+  util.return %2 : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @FoldSubviewsIntoCmdDispatchOp
-func.func @FoldSubviewsIntoCmdDispatchOp(%arg0: !stream.resource<transient>, %arg1: index) -> !stream.timepoint {
+util.func private @FoldSubviewsIntoCmdDispatchOp(%arg0: !stream.resource<transient>, %arg1: index) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c64 = arith.constant 64 : index
@@ -65,13 +65,13 @@
       wo %arg3[%c2000 for %c1000] : !stream.resource<transient>{%c4000}
     }
   } => !stream.timepoint
-  return %2 : !stream.timepoint
+  util.return %2 : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @ElideImmediateCmdExecuteWaits
-func.func @ElideImmediateCmdExecuteWaits(%arg0: !stream.resource<transient>, %arg1: index) -> !stream.timepoint {
+util.func private @ElideImmediateCmdExecuteWaits(%arg0: !stream.resource<transient>, %arg1: index) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   // CHECK-NOT: stream.timepoint.immediate
   %imm = stream.timepoint.immediate => !stream.timepoint
@@ -79,13 +79,13 @@
   %0 = stream.cmd.execute await(%imm) => with(%arg0 as %arg2: !stream.resource<transient>{%arg1}) {
     stream.cmd.discard %arg2[%c0 for %arg1] : !stream.resource<transient>{%arg1}
   } => !stream.timepoint
-  return %0 : !stream.timepoint
+  util.return %0 : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @ChainCmdExecuteWaits
-func.func @ChainCmdExecuteWaits(%arg0: !stream.resource<transient>, %arg1: index, %arg2: !stream.timepoint) -> !stream.timepoint {
+util.func private @ChainCmdExecuteWaits(%arg0: !stream.resource<transient>, %arg1: index, %arg2: !stream.timepoint) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   // CHECK-NOT: stream.timepoint.await
@@ -95,13 +95,13 @@
     // CHECK: stream.cmd.discard
     stream.cmd.discard %arg3[%c0 for %c128] : !stream.resource<transient>{%arg1}
   } => !stream.timepoint
-  return %1 : !stream.timepoint
+  util.return %1 : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @CloneCapturedCmdExecuteSubviewOps
-func.func @CloneCapturedCmdExecuteSubviewOps(%arg0: !stream.resource<transient>, %arg1: index) -> !stream.timepoint {
+util.func private @CloneCapturedCmdExecuteSubviewOps(%arg0: !stream.resource<transient>, %arg1: index) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %c64 = arith.constant 64 : index
   %c128 = arith.constant 128 : index
@@ -115,30 +115,30 @@
     // CHECK: stream.cmd.discard %arg2[%c1064 for %c2000] : !stream.resource<transient>{%arg1}
     stream.cmd.discard %arg3[%c1000 for %c2000] : !stream.resource<transient>{%arg1}
   } => !stream.timepoint
-  return %1 : !stream.timepoint
+  util.return %1 : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @ElideNoOpCmdExecuteOp
-func.func @ElideNoOpCmdExecuteOp(%arg0: !stream.resource<transient>, %arg1: index, %arg2: !stream.timepoint) -> !stream.timepoint {
+util.func private @ElideNoOpCmdExecuteOp(%arg0: !stream.resource<transient>, %arg1: index, %arg2: !stream.timepoint) -> !stream.timepoint {
   // CHECK-NOT: stream.cmd.execute
   %0 = stream.cmd.execute await(%arg2) => with(%arg0 as %arg3: !stream.resource<transient>{%arg1}) {
   } => !stream.timepoint
   // CHECK: %[[IMM:.+]] = stream.timepoint.immediate
-  // CHECK: return %[[IMM]]
-  return %0 : !stream.timepoint
+  // CHECK: util.return %[[IMM]]
+  util.return %0 : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @ElideUnusedCmdExecuteOp
-func.func @ElideUnusedCmdExecuteOp(%arg0: !stream.resource<transient>, %arg1: index, %arg2: !stream.timepoint) {
+util.func private @ElideUnusedCmdExecuteOp(%arg0: !stream.resource<transient>, %arg1: index, %arg2: !stream.timepoint) {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   // CHECK-NOT: stream.cmd.execute
   %0 = stream.cmd.execute await(%arg2) => with(%arg0 as %arg3: !stream.resource<transient>{%arg1}) {
     stream.cmd.discard %arg3[%c0 for %c128] : !stream.resource<transient>{%arg1}
   } => !stream.timepoint
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/test/cmd_ops.mlir b/compiler/src/iree/compiler/Dialect/Stream/IR/test/cmd_ops.mlir
index 9298c0f..d75df4c 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/test/cmd_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/test/cmd_ops.mlir

@@ -1,7 +1,7 @@
 // RUN: iree-opt --split-input-file --allow-unregistered-dialect %s | iree-opt --split-input-file --allow-unregistered-dialect | FileCheck %s
 
 // CHECK-LABEL: @cmdMemoryControl
-func.func @cmdMemoryControl(%arg0: !stream.resource<transient>, %arg1: index) -> !stream.timepoint {
+util.func private @cmdMemoryControl(%arg0: !stream.resource<transient>, %arg1: index) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   %0 = stream.cmd.execute with(%arg0 as %arg2: !stream.resource<transient>{%arg1}) {
@@ -12,13 +12,13 @@
     // CHECK: stream.cmd.discard %arg2[%c0 for %c128] : !stream.resource<transient>{%arg1}
     stream.cmd.discard %arg2[%c0 for %c128] : !stream.resource<transient>{%arg1}
   } => !stream.timepoint
-  return %0 : !stream.timepoint
+  util.return %0 : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @cmdFill
-func.func @cmdFill(%arg0: !stream.resource<transient>, %arg1: index) -> !stream.timepoint {
+util.func private @cmdFill(%arg0: !stream.resource<transient>, %arg1: index) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   %c255_i32 = arith.constant 255 : i32
@@ -26,27 +26,27 @@
     // CHECK: stream.cmd.fill %c255_i32, %arg2[%c0 for %c128] : i32 -> !stream.resource<transient>{%arg1}
     stream.cmd.fill %c255_i32, %arg2[%c0 for %c128] : i32 -> !stream.resource<transient>{%arg1}
   } => !stream.timepoint
-  return %0 : !stream.timepoint
+  util.return %0 : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @cmdCopy
-func.func @cmdCopy(%arg0: !stream.resource<transient>, %arg1: index, %arg2: !stream.resource<staging>, %arg3: index) -> !stream.timepoint {
+util.func private @cmdCopy(%arg0: !stream.resource<transient>, %arg1: index, %arg2: !stream.resource<staging>, %arg3: index) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   %0 = stream.cmd.execute with(%arg0 as %arg4: !stream.resource<transient>{%arg1}, %arg2 as %arg5: !stream.resource<staging>{%arg3}) {
     // CHECK: stream.cmd.copy %arg4[%c0], %arg5[%c0], %c128 : !stream.resource<transient>{%arg1} -> !stream.resource<staging>{%arg3}
     stream.cmd.copy %arg4[%c0], %arg5[%c0], %c128 : !stream.resource<transient>{%arg1} -> !stream.resource<staging>{%arg3}
   } => !stream.timepoint
-  return %0 : !stream.timepoint
+  util.return %0 : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @cmdCollective
 // CHECK-SAME: %[[CHANNEL:[a-z0-9]+]]: !stream.channel
-func.func @cmdCollective(%arg0: !stream.resource<transient>, %arg1: index, %arg2: !stream.resource<transient>, %arg3: index, %channel: !stream.channel) -> !stream.timepoint {
+util.func private @cmdCollective(%arg0: !stream.resource<transient>, %arg1: index, %arg2: !stream.resource<transient>, %arg3: index, %channel: !stream.channel) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   // CHECK: stream.cmd.execute
@@ -91,13 +91,13 @@
     }
 
   } => !stream.timepoint
-  return %0 : !stream.timepoint
+  util.return %0 : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @cmdDispatch
-func.func @cmdDispatch(%arg0: !stream.resource<transient>, %arg1: index, %arg2: !stream.resource<external>, %arg3: index) -> !stream.timepoint {
+util.func private @cmdDispatch(%arg0: !stream.resource<transient>, %arg1: index, %arg2: !stream.resource<external>, %arg3: index) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
@@ -115,7 +115,7 @@
       wo %arg5[%c0 for %c128] : !stream.resource<external>{%arg3}
     }
   } => !stream.timepoint
-  return %0 : !stream.timepoint
+  util.return %0 : !stream.timepoint
 }
 
 // -----
@@ -123,8 +123,8 @@
 // CHECK: stream.cmd.func private @cmdFunc(%arg0[%arg1 for %arg2]: !stream.resource<*>, %arg3: i32, %arg4[%arg5 for %arg6]: !stream.resource<*>, %arg7: !custom.type, %arg8[%arg9 for %arg10]: !stream.resource<*>)
 stream.cmd.func private @cmdFunc(%arg0[%arg1 for %arg2]: !stream.resource<*>, %arg3: i32, %arg4[%arg5 for %arg6]: !stream.resource<*>, %arg7: !custom.type, %arg8[%arg9 for %arg10]: !stream.resource<*>)
 
-// CHECK-LABEL: func.func @cmdCall
-func.func @cmdCall(%arg0: !stream.resource<external>, %arg1: i32, %arg2: !stream.resource<external>, %arg3: !custom.type, %arg4: !stream.resource<external>) -> !stream.timepoint {
+// CHECK-LABEL: util.func private @cmdCall
+util.func private @cmdCall(%arg0: !stream.resource<external>, %arg1: i32, %arg2: !stream.resource<external>, %arg3: !custom.type, %arg4: !stream.resource<external>) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %size0 = arith.constant 100 : index
   %size1 = arith.constant 101 : index
@@ -134,13 +134,13 @@
     // CHECK: stream.cmd.call @cmdFunc(ro %[[STREAM0]][%c0 for %[[SIZE0]]], %arg1, rw %[[STREAM1]][%c0 for %[[SIZE1]]], %arg3, wo %[[STREAM2]][%c0 for %[[SIZE2]]]) : (!stream.resource<external>{%[[SIZE0]]}, i32, !stream.resource<external>{%[[SIZE1]]}, !custom.type, !stream.resource<external>{%[[SIZE2]]}) -> ()
     stream.cmd.call @cmdFunc(ro %stream0[%c0 for %size0], %arg1, rw %stream1[%c0 for %size1], %arg3, wo %stream2[%c0 for %size2]) : (!stream.resource<external>{%size0}, i32, !stream.resource<external>{%size1}, !custom.type, !stream.resource<external>{%size2}) -> ()
   } => !stream.timepoint
-  return %timepoint : !stream.timepoint
+  util.return %timepoint : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @cmdExecute
-func.func @cmdExecute(%arg0: !stream.resource<transient>, %arg1: index, %arg2: !stream.resource<staging>, %arg3: index, %arg4: !stream.timepoint) -> !stream.timepoint {
+util.func private @cmdExecute(%arg0: !stream.resource<transient>, %arg1: index, %arg2: !stream.resource<staging>, %arg3: index, %arg4: !stream.timepoint) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   // CHECK: = stream.cmd.execute await(%arg4) => with(%arg0 as %arg5: !stream.resource<transient>{%arg1}, %arg2 as %arg6: !stream.resource<staging>{%arg3}) {
@@ -161,5 +161,5 @@
     }
   // CHECK: } => !stream.timepoint
   } => !stream.timepoint
-  return %0 : !stream.timepoint
+  util.return %0 : !stream.timepoint
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/test/context_ops.mlir b/compiler/src/iree/compiler/Dialect/Stream/IR/test/context_ops.mlir
index c324ff3..ab523ec 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/test/context_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/test/context_ops.mlir

@@ -1,12 +1,12 @@
 // RUN: iree-opt --split-input-file %s | iree-opt --split-input-file | FileCheck %s
 
 // CHECK-LABEL: @context_resolve
-func.func @context_resolve() {
+util.func private @context_resolve() {
   // CHECK: = stream.context.resolve : !hal.allocator
   %allocator = stream.context.resolve : !hal.allocator
   // CHECK: = stream.context.resolve on(#hal.affinity.queue<*>) : !hal.device, i64
   %device1, %queue_affinity_any = stream.context.resolve on(#hal.affinity.queue<*>) : !hal.device, i64
   // CHECK: = stream.context.resolve on(#hal.affinity.queue<[4, 5]>) : !hal.device, i64
   %device0, %queue_affinity_45 = stream.context.resolve on(#hal.affinity.queue<[4, 5]>) : !hal.device, i64
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/test/executable_ops.mlir b/compiler/src/iree/compiler/Dialect/Stream/IR/test/executable_ops.mlir
index 985e3de..c11de1d 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/test/executable_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/test/executable_ops.mlir

@@ -6,15 +6,15 @@
   stream.executable.export public @dispatch
   // CHECK-NEXT: builtin.module
   builtin.module {
-    // CHECK-NEXT: func.func @dispatch(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index) {
-    func.func @dispatch(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index) {
+    // CHECK-NEXT: util.func private @dispatch(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index) {
+    util.func private @dispatch(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index) {
       %c0 = arith.constant 0 : index
       // CHECK-DAG: = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x5x64xf32>>{%arg2}
       %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x5x64xf32>>{%arg2}
       // CHECK-DAG: = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x5x4xf32>>{%arg2}
       %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x5x4xf32>>{%arg2}
-      // CHECK: return
-      return
+      // CHECK: util.return
+      util.return
     }
   }
 }
@@ -32,10 +32,10 @@
       }
   // CHECK: builtin.module
   builtin.module {
-    // CHECK-NEXT: func.func @dispatch
-    func.func @dispatch() {
-      // CHECK: return
-      return
+    // CHECK-NEXT: util.func private @dispatch
+    util.func private @dispatch() {
+      // CHECK: util.return
+      util.return
     }
   }
 }
@@ -48,8 +48,8 @@
     stream.return %arg0, %arg1 : index, index
   }
   builtin.module  {
-    func.func @dispatch() {
-      return
+    util.func private @dispatch() {
+      util.return
     }
   }
 }
@@ -62,8 +62,8 @@
     stream.return %arg0, %arg1, %arg0 : index, f32, index
   }
   builtin.module  {
-    func.func @dispatch() {
-      return
+    util.func private @dispatch() {
+      util.return
     }
   }
 }
@@ -73,15 +73,15 @@
 stream.executable private @executable {
   stream.executable.export public @dispatch
   builtin.module {
-    func.func @dispatch(%arg0: !stream.binding, %arg1: index) {
+    util.func private @dispatch(%arg0: !stream.binding, %arg1: index) {
       %c0 = arith.constant 0 : index
       %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<?x5x64xf32>>{%arg1}
-      return
+      util.return
     }
   }
 }
 
-func.func @cmdDispatchExecutableSignatureMismatch(%arg0: !stream.resource<transient>,
+util.func private @cmdDispatchExecutableSignatureMismatch(%arg0: !stream.resource<transient>,
                                                   %arg1: index,
                                                   %arg2: !stream.resource<external>,
                                                   %arg3: index) -> !stream.timepoint {
@@ -96,5 +96,5 @@
       wo %arg5[%c0 for %c128] : !stream.resource<external>{%arg3}
     }
   } => !stream.timepoint
-  return %0 : !stream.timepoint
+  util.return %0 : !stream.timepoint
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/test/file_ops.mlir b/compiler/src/iree/compiler/Dialect/Stream/IR/test/file_ops.mlir
index dbed4df..94680ce 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/test/file_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/test/file_ops.mlir

@@ -2,36 +2,36 @@
 
 // CHECK-LABEL: @file_constant
 //  CHECK-SAME: (%[[BUFFER:.+]]: !util.buffer)
-func.func @file_constant(%buffer: !util.buffer) {
+util.func private @file_constant(%buffer: !util.buffer) {
   %c0 = arith.constant 0 : index
   %c1088 = arith.constant 1088 : index
   // CHECK: %file = stream.file.constant %[[BUFFER]][%c0 for %c1088] : !util.buffer{%c1088} -> !stream.file
   %file = stream.file.constant %buffer[%c0 for %c1088] : !util.buffer{%c1088} -> !stream.file
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @file_read
 //  CHECK-SAME: (%[[WAIT:.+]]: !stream.timepoint, %[[FILE:.+]]: !stream.file, %[[RESOURCE:.+]]: !stream.resource<variable>)
-func.func @file_read(%wait: !stream.timepoint, %file: !stream.file, %resource: !stream.resource<variable>) {
+util.func private @file_read(%wait: !stream.timepoint, %file: !stream.file, %resource: !stream.resource<variable>) {
   %c0 = arith.constant 0 : index
   %c0_i64 = arith.constant 0 : i64
   %c1088 = arith.constant 1088 : index
   // CHECK: = stream.file.read await(%[[WAIT]]) => %[[FILE]][%c0_i64], %[[RESOURCE]][%c0], %c1088 : !stream.file -> !stream.resource<variable>{%c1088} => !stream.timepoint
   %0 = stream.file.read await(%wait) => %file[%c0_i64], %resource[%c0], %c1088 : !stream.file -> !stream.resource<variable>{%c1088} => !stream.timepoint
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @file_write
 //  CHECK-SAME: (%[[WAIT:.+]]: !stream.timepoint, %[[FILE:.+]]: !stream.file, %[[RESOURCE:.+]]: !stream.resource<variable>)
-func.func @file_write(%wait: !stream.timepoint, %file: !stream.file, %resource: !stream.resource<variable>) {
+util.func private @file_write(%wait: !stream.timepoint, %file: !stream.file, %resource: !stream.resource<variable>) {
   %c0 = arith.constant 0 : index
   %c0_i64 = arith.constant 0 : i64
   %c1088 = arith.constant 1088 : index
   // CHECK: = stream.file.write await(%[[WAIT]]) => %[[RESOURCE]][%c0], %[[FILE]][%c0_i64], %c1088 : !stream.resource<variable>{%c1088} -> !stream.file => !stream.timepoint
   %0 = stream.file.write await(%wait) => %resource[%c0], %file[%c0_i64], %c1088 : !stream.resource<variable>{%c1088} -> !stream.file => !stream.timepoint
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/test/parameter_folding.mlir b/compiler/src/iree/compiler/Dialect/Stream/IR/test/parameter_folding.mlir
index cb9bf80..dfabc9a 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/test/parameter_folding.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/test/parameter_folding.mlir

@@ -2,7 +2,7 @@
 
 // CHECK-LABEL: @FoldParameterLoadTargetSubview
 // CHECK-SAME: (%[[WAIT:.+]]: !stream.timepoint, %[[OFFSET0:.+]]: index, %[[LENGTH0:.+]]: index, %[[OFFSET1:.+]]: index, %[[LENGTH1:.+]]: index)
-func.func @FoldParameterLoadTargetSubview(%wait: !stream.timepoint, %offset0: index, %length0: index, %offset1: index, %length1: index) -> (!stream.resource<constant>, !stream.resource<constant>, !stream.timepoint) {
+util.func private @FoldParameterLoadTargetSubview(%wait: !stream.timepoint, %offset0: index, %length0: index, %offset1: index, %length1: index) -> (!stream.resource<constant>, !stream.resource<constant>, !stream.timepoint) {
   %c50_i64 = arith.constant 50 : i64
   %c51_i64 = arith.constant 51 : i64
   %c100 = arith.constant 100 : index
@@ -23,15 +23,15 @@
   %subview0 = stream.resource.subview %results#0[%offset0] : !stream.resource<constant>{%c100} -> !stream.resource<constant>{%length0}
   // CHECK-NOT: stream.resource.subview
   %subview1 = stream.resource.subview %results#1[%offset1] : !stream.resource<constant>{%c200} -> !stream.resource<constant>{%length1}
-  // CHECK: return %[[RESULTS]]#0, %[[RESULTS]]#1, %[[SIGNAL]]
-  return %subview0, %subview1, %result_timepoint : !stream.resource<constant>, !stream.resource<constant>, !stream.timepoint
+  // CHECK: util.return %[[RESULTS]]#0, %[[RESULTS]]#1, %[[SIGNAL]]
+  util.return %subview0, %subview1, %result_timepoint : !stream.resource<constant>, !stream.resource<constant>, !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @FoldParameterReadTargetSubview
 // CHECK-SAME: (%[[WAIT:.+]]: !stream.timepoint, %[[TARGET:.+]]: !stream.resource<transient>, %[[OFFSET:.+]]: index, %[[LENGTH:.+]]: index)
-func.func @FoldParameterReadTargetSubview(%wait: !stream.timepoint, %target: !stream.resource<transient>, %offset: index, %length: index) -> !stream.timepoint {
+util.func private @FoldParameterReadTargetSubview(%wait: !stream.timepoint, %target: !stream.resource<transient>, %offset: index, %length: index) -> !stream.timepoint {
   %c50_i64 = arith.constant 50 : i64
   %c100 = arith.constant 100 : index
   %c200 = arith.constant 200 : index
@@ -43,14 +43,14 @@
   %subview = stream.resource.subview %target[%offset] : !stream.resource<transient>{%length} -> !stream.resource<transient>{%c300}
   // CHECK: = stream.parameter.read await(%[[WAIT]]) => "scope"::"key"[%[[PARAMETER_OFFSET]]] -> %[[TARGET]][%[[RESOURCE_OFFSET]] for %c200] : !stream.resource<transient>{%[[LENGTH]]} => !stream.timepoint
   %timepoint = stream.parameter.read await(%wait) => "scope"::"key"[%c50_i64] -> %subview[%c100 for %c200] : !stream.resource<transient>{%c300} => !stream.timepoint
-  return %timepoint : !stream.timepoint
+  util.return %timepoint : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @FoldParameterWriteSourceSubview
 // CHECK-SAME: (%[[WAIT:.+]]: !stream.timepoint, %[[SOURCE:.+]]: !stream.resource<transient>, %[[OFFSET:.+]]: index, %[[LENGTH:.+]]: index)
-func.func @FoldParameterWriteSourceSubview(%wait: !stream.timepoint, %source: !stream.resource<transient>, %offset: index, %length: index) -> !stream.timepoint {
+util.func private @FoldParameterWriteSourceSubview(%wait: !stream.timepoint, %source: !stream.resource<transient>, %offset: index, %length: index) -> !stream.timepoint {
   %c50_i64 = arith.constant 50 : i64
   %c100 = arith.constant 100 : index
   %c200 = arith.constant 200 : index
@@ -62,5 +62,5 @@
   %subview = stream.resource.subview %source[%offset] : !stream.resource<transient>{%length} -> !stream.resource<transient>{%c300}
   // CHECK: = stream.parameter.write await(%[[WAIT]]) => %[[SOURCE]][%[[RESOURCE_OFFSET]] for %c200] : !stream.resource<transient>{%[[LENGTH]]} -> "scope"::"key"[%[[PARAMETER_OFFSET]]] => !stream.timepoint
   %timepoint = stream.parameter.write await(%wait) => %subview[%c100 for %c200] : !stream.resource<transient>{%c300} -> "scope"::"key"[%c50_i64] => !stream.timepoint
-  return %timepoint : !stream.timepoint
+  util.return %timepoint : !stream.timepoint
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/test/parameter_ops.mlir b/compiler/src/iree/compiler/Dialect/Stream/IR/test/parameter_ops.mlir
index eff0bb0..206103f 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/test/parameter_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/test/parameter_ops.mlir

@@ -11,7 +11,7 @@
 
 // CHECK-LABEL: @parameterLoad
 // CHECK-SAME: (%[[WAIT:.+]]: !stream.timepoint)
-func.func @parameterLoad(%wait: !stream.timepoint) -> (!stream.resource<constant>, !stream.resource<constant>, !stream.timepoint) {
+util.func private @parameterLoad(%wait: !stream.timepoint) -> (!stream.resource<constant>, !stream.resource<constant>, !stream.timepoint) {
   %c50_i64 = arith.constant 50 : i64
   %c51_i64 = arith.constant 51 : i64
   %c100 = arith.constant 100 : index
@@ -24,14 +24,14 @@
     "scope"::"key0"[%c50_i64] : !stream.resource<constant>{%c100},
     "scope"::"key1"[%c51_i64] : !stream.resource<constant>{%c200}
   } => !stream.timepoint
-  return %results#0, %results#1, %result_timepoint : !stream.resource<constant>, !stream.resource<constant>, !stream.timepoint
+  util.return %results#0, %results#1, %result_timepoint : !stream.resource<constant>, !stream.resource<constant>, !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @parameterLoadNoScope
 // CHECK-SAME: (%[[WAIT:.+]]: !stream.timepoint)
-func.func @parameterLoadNoScope(%wait: !stream.timepoint) -> (!stream.resource<constant>, !stream.timepoint) {
+util.func private @parameterLoadNoScope(%wait: !stream.timepoint) -> (!stream.resource<constant>, !stream.timepoint) {
   %c50_i64 = arith.constant 50 : i64
   %c100 = arith.constant 100 : index
   // CHECK: = stream.parameter.load await(%[[WAIT]]) => {
@@ -40,42 +40,42 @@
   %result, %result_timepoint = stream.parameter.load await(%wait) => {
     "key"[%c50_i64] : !stream.resource<constant>{%c100}
   } => !stream.timepoint
-  return %result, %result_timepoint : !stream.resource<constant>, !stream.timepoint
+  util.return %result, %result_timepoint : !stream.resource<constant>, !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @parameterRead
 // CHECK-SAME: (%[[WAIT:.+]]: !stream.timepoint, %[[TARGET:.+]]: !stream.resource<transient>)
-func.func @parameterRead(%wait: !stream.timepoint, %target: !stream.resource<transient>) -> !stream.timepoint {
+util.func private @parameterRead(%wait: !stream.timepoint, %target: !stream.resource<transient>) -> !stream.timepoint {
   %c50_i64 = arith.constant 50 : i64
   %c100 = arith.constant 100 : index
   %c200 = arith.constant 200 : index
   %c300 = arith.constant 300 : index
   // CHECK: = stream.parameter.read await(%[[WAIT]]) => "scope"::"key"[%c50_i64] -> %[[TARGET]][%c100 for %c200] : !stream.resource<transient>{%c300} => !stream.timepoint
   %timepoint = stream.parameter.read await(%wait) => "scope"::"key"[%c50_i64] -> %target[%c100 for %c200] : !stream.resource<transient>{%c300} => !stream.timepoint
-  return %timepoint : !stream.timepoint
+  util.return %timepoint : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @parameterWrite
 // CHECK-SAME: (%[[WAIT:.+]]: !stream.timepoint, %[[SOURCE:.+]]: !stream.resource<transient>)
-func.func @parameterWrite(%wait: !stream.timepoint, %source: !stream.resource<transient>) -> !stream.timepoint {
+util.func private @parameterWrite(%wait: !stream.timepoint, %source: !stream.resource<transient>) -> !stream.timepoint {
   %c50_i64 = arith.constant 50 : i64
   %c100 = arith.constant 100 : index
   %c200 = arith.constant 200 : index
   %c300 = arith.constant 300 : index
   // CHECK: = stream.parameter.write await(%[[WAIT]]) => %[[SOURCE]][%c100 for %c200] : !stream.resource<transient>{%c300} -> "scope"::"key"[%c50_i64] => !stream.timepoint
   %timepoint = stream.parameter.write await(%wait) => %source[%c100 for %c200] : !stream.resource<transient>{%c300} -> "scope"::"key"[%c50_i64] => !stream.timepoint
-  return %timepoint : !stream.timepoint
+  util.return %timepoint : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @parameterGather
 // CHECK-SAME: (%[[WAIT:.+]]: !stream.timepoint, %[[TARGET:.+]]: !stream.resource<transient>)
-func.func @parameterGather(%wait: !stream.timepoint, %target: !stream.resource<transient>) -> !stream.timepoint {
+util.func private @parameterGather(%wait: !stream.timepoint, %target: !stream.resource<transient>) -> !stream.timepoint {
   %c50_i64 = arith.constant 50 : i64
   %c51_i64 = arith.constant 51 : i64
   %c52_i64 = arith.constant 52 : i64
@@ -96,14 +96,14 @@
     "scope"::"key1"[%c51_i64] -> %target[%c101 for %c201] : !stream.resource<transient>{%c300},
     "scope"::"key2"[%c52_i64] -> %target[%c102 for %c202] : !stream.resource<transient>{%c300}
   } => !stream.timepoint
-  return %timepoint : !stream.timepoint
+  util.return %timepoint : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @parameterGatherNoScope
 // CHECK-SAME: (%[[WAIT:.+]]: !stream.timepoint, %[[TARGET:.+]]: !stream.resource<transient>)
-func.func @parameterGatherNoScope(%wait: !stream.timepoint, %target: !stream.resource<transient>) -> !stream.timepoint {
+util.func private @parameterGatherNoScope(%wait: !stream.timepoint, %target: !stream.resource<transient>) -> !stream.timepoint {
   %c50_i64 = arith.constant 50 : i64
   %c51_i64 = arith.constant 51 : i64
   %c100 = arith.constant 100 : index
@@ -119,14 +119,14 @@
     "key0"[%c50_i64] -> %target[%c100 for %c200] : !stream.resource<transient>{%c300},
     "key1"[%c51_i64] -> %target[%c101 for %c201] : !stream.resource<transient>{%c300}
   } => !stream.timepoint
-  return %timepoint : !stream.timepoint
+  util.return %timepoint : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @parameterScatter
 // CHECK-SAME: (%[[WAIT:.+]]: !stream.timepoint, %[[SOURCE:.+]]: !stream.resource<transient>)
-func.func @parameterScatter(%wait: !stream.timepoint, %source: !stream.resource<transient>) -> !stream.timepoint {
+util.func private @parameterScatter(%wait: !stream.timepoint, %source: !stream.resource<transient>) -> !stream.timepoint {
   %c50_i64 = arith.constant 50 : i64
   %c51_i64 = arith.constant 51 : i64
   %c52_i64 = arith.constant 52 : i64
@@ -147,5 +147,5 @@
     %source[%c101 for %c201] : !stream.resource<transient>{%c300} -> "scope"::"key1"[%c51_i64],
     %source[%c102 for %c202] : !stream.resource<transient>{%c300} -> "scope"::"key2"[%c52_i64]
   } => !stream.timepoint
-  return %timepoint : !stream.timepoint
+  util.return %timepoint : !stream.timepoint
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/test/resource_folding.mlir b/compiler/src/iree/compiler/Dialect/Stream/IR/test/resource_folding.mlir
index 8db5ea7..0220256 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/test/resource_folding.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/test/resource_folding.mlir

@@ -1,20 +1,20 @@
 // RUN: iree-opt --split-input-file --canonicalize %s | iree-opt --split-input-file | FileCheck %s
 
 // CHECK-LABEL: @FoldResourceSizeOp
-func.func @FoldResourceSizeOp(%arg0: !stream.resource<staging>, %arg1: index) -> (index, i32) {
+util.func private @FoldResourceSizeOp(%arg0: !stream.resource<staging>, %arg1: index) -> (index, i32) {
   %c0 = arith.constant 0 : index
   // CHECK-NOT: stream.resource.size
   %0 = stream.resource.size %arg0 : !stream.resource<staging>
   // CHECK: %[[LOAD:.+]] = stream.resource.load
   %1 = stream.resource.load %arg0[%c0] : !stream.resource<staging>{%arg1} -> i32
-  // CHECK: return %arg1, %[[LOAD]]
-  return %0, %1 : index, i32
+  // CHECK: util.return %arg1, %[[LOAD]]
+  util.return %0, %1 : index, i32
 }
 
 // -----
 
 // CHECK-LABEL: @SelectResourceSizeOp
-func.func @SelectResourceSizeOp(%arg0: !stream.resource<staging>, %arg1: index, %arg2: !stream.resource<staging>, %arg3: index, %arg4: i1) -> (!stream.resource<staging>, index) {
+util.func private @SelectResourceSizeOp(%arg0: !stream.resource<staging>, %arg1: index, %arg2: !stream.resource<staging>, %arg3: index, %arg4: i1) -> (!stream.resource<staging>, index) {
   // CHECK: %[[ARG0_T:.+]] = stream.async.transfer %arg0 {{.+}} -> !stream.resource<*>{%[[ARG0_SZ:.+]]}
   %0 = stream.async.transfer %arg0 : !stream.resource<staging>{%arg1} -> !stream.resource<*>{%arg1}
   // CHECK: %[[ARG2_T:.+]] = stream.async.transfer %arg2 {{.+}} -> !stream.resource<*>{%[[ARG2_SZ:.+]]}
@@ -25,13 +25,13 @@
   %3 = stream.resource.size %2 : !stream.resource<*>
   // CHECK: = stream.async.transfer %[[RET_T]] : !stream.resource<*>{%[[RET_SIZE]]}
   %4 = stream.async.transfer %2 : !stream.resource<*>{%3} -> !stream.resource<staging>{%3}
-  return %4, %3 : !stream.resource<staging>, index
+  util.return %4, %3 : !stream.resource<staging>, index
 }
 
 // -----
 
 // CHECK-LABEL: @FoldSubviewIntoLoadOp
-func.func @FoldSubviewIntoLoadOp(%arg0: !stream.resource<staging>, %arg1: index) -> i32 {
+util.func private @FoldSubviewIntoLoadOp(%arg0: !stream.resource<staging>, %arg1: index) -> i32 {
   %c64 = arith.constant 64 : index
   %c128 = arith.constant 128 : index
   %c256 = arith.constant 256 : index
@@ -39,14 +39,14 @@
   %0 = stream.resource.subview %arg0[%c128] : !stream.resource<staging>{%arg1} -> !stream.resource<staging>{%c256}
   // CHECK: = stream.resource.load %arg0[%c192] : !stream.resource<staging>{%arg1} -> i32
   %1 = stream.resource.load %0[%c64] : !stream.resource<staging>{%c256} -> i32
-  return %1 : i32
+  util.return %1 : i32
 }
 
 // -----
 
 // CHECK-LABEL: @DontFoldSubviewIntoLoadAcrossAwaitOp
 // CHECK-SAME: (%[[SOURCE:.+]]: !stream.resource<staging>, %[[SIZE:.+]]: index, %[[FENCE:.+]]: !stream.timepoint)
-func.func @DontFoldSubviewIntoLoadAcrossAwaitOp(%source: !stream.resource<staging>, %size: index, %fence: !stream.timepoint) -> i32 {
+util.func private @DontFoldSubviewIntoLoadAcrossAwaitOp(%source: !stream.resource<staging>, %size: index, %fence: !stream.timepoint) -> i32 {
   %c64 = arith.constant 64 : index
   %c128 = arith.constant 128 : index
   %c256 = arith.constant 256 : index
@@ -56,13 +56,13 @@
   %1 = stream.timepoint.await %fence => %0 : !stream.resource<staging>{%c256}
   // CHECK: = stream.resource.load %[[READY]][%c192] : !stream.resource<staging>{%[[SIZE]]} -> i32
   %2 = stream.resource.load %1[%c64] : !stream.resource<staging>{%c256} -> i32
-  return %2 : i32
+  util.return %2 : i32
 }
 
 // -----
 
 // CHECK-LABEL: @FoldSubviewIntoStoreOp
-func.func @FoldSubviewIntoStoreOp(%arg0: !stream.resource<staging>, %arg1: index) {
+util.func private @FoldSubviewIntoStoreOp(%arg0: !stream.resource<staging>, %arg1: index) {
   %c64 = arith.constant 64 : index
   %c128 = arith.constant 128 : index
   %c256 = arith.constant 256 : index
@@ -71,7 +71,7 @@
   %0 = stream.resource.subview %arg0[%c128] : !stream.resource<staging>{%arg1} -> !stream.resource<staging>{%c256}
   // CHECK: stream.resource.store %c123_i32, %arg0[%c192] : i32 -> !stream.resource<staging>{%arg1}
   stream.resource.store %c123_i32, %0[%c64] : i32 -> !stream.resource<staging>{%c256}
-  return
+  util.return
 }
 
 // -----
@@ -79,11 +79,11 @@
 // A pack with no slices folds to a zero-length slab.
 
 // CHECK-LABEL: @FoldResourcePackOpEmpty
-func.func @FoldResourcePackOpEmpty(%allocator: !hal.allocator) -> index {
+util.func private @FoldResourcePackOpEmpty(%allocator: !hal.allocator) -> index {
   // CHECK-NEXT: %[[ZERO_LENGTH:.+]] = arith.constant 0
   %total_length = stream.resource.pack slices({}) : index
-  // CHECK-NEXT: return %[[ZERO_LENGTH]]
-  return %total_length : index
+  // CHECK-NEXT: util.return %[[ZERO_LENGTH]]
+  util.return %total_length : index
 }
 
 // -----
@@ -93,7 +93,7 @@
 // CHECK-LABEL: @FoldResourcePackOpOneSlice
 // CHECK-SAME: %[[OFFSET:.+]]: index,
 // CHECK-SAME: %[[SIZE:.+]]: index
-func.func @FoldResourcePackOpOneSlice(%offset: index, %size: index) -> (index, index) {
+util.func private @FoldResourcePackOpOneSlice(%offset: index, %size: index) -> (index, index) {
   // CHECK-NOT: stream.resource.pack
   %total_length, %offset_0 =
       stream.resource.pack
@@ -101,8 +101,8 @@
         slices({
           [0, 4] = %size
         }) : index
-  // CHECK: return %[[SIZE]], %[[OFFSET]]
-  return %total_length, %offset_0 : index, index
+  // CHECK: util.return %[[SIZE]], %[[OFFSET]]
+  util.return %total_length, %offset_0 : index, index
 }
 
 // -----
@@ -110,7 +110,7 @@
 // A constant zero offset operand gets dropped.
 
 // CHECK-LABEL: @PropagateResourcePackZeroOffset
-func.func @PropagateResourcePackZeroOffset(%size : index) -> (index, index, index) {
+util.func private @PropagateResourcePackZeroOffset(%size : index) -> (index, index, index) {
   // CHECK-NOT: constant 0
   // CHECK-NEXT: = stream.resource.pack slices({
   %base_offset = arith.constant 0 : index
@@ -121,7 +121,7 @@
           [0, 4] = %size,
           [1, 2] = %size,
         }) : index
-  return %total_length, %offset_0, %offset_1 : index, index, index
+  util.return %total_length, %offset_0, %offset_1 : index, index, index
 }
 
 // -----
@@ -131,7 +131,7 @@
 // CHECK-LABEL: @PropagateResourcePackBaseOffset
 // CHECK-SAME: %[[BASE_OFFSET:.+]]: index,
 // CHECK-SAME: %[[SIZE:.+]]: index
-func.func @PropagateResourcePackBaseOffset(%base_offset: index, %size : index) -> (index, index, index) {
+util.func private @PropagateResourcePackBaseOffset(%base_offset: index, %size : index) -> (index, index, index) {
   // CHECK-NEXT: %[[PACKED:.+]]:3 =
   %total_length, %offset_0, %offset_1 =
       // CHECK-SAME: stream.resource.pack slices({
@@ -143,8 +143,8 @@
         }) : index
   //      CHECK: %[[ADJUSTED_0:.+]] = arith.addi %[[BASE_OFFSET]], %[[PACKED]]#1
   // CHECK-NEXT: %[[ADJUSTED_1:.+]] = arith.addi %[[BASE_OFFSET]], %[[PACKED]]#2
-  // CHECK-NEXT: return %[[PACKED]]#0, %[[ADJUSTED_0]], %[[ADJUSTED_1]]
-  return %total_length, %offset_0, %offset_1 : index, index, index
+  // CHECK-NEXT: util.return %[[PACKED]]#0, %[[ADJUSTED_0]], %[[ADJUSTED_1]]
+  util.return %total_length, %offset_0, %offset_1 : index, index, index
 }
 
 // -----
@@ -153,7 +153,7 @@
 
 // CHECK-LABEL: @CanonicalizeResourcePackIntervals
 // CHECK-SAME: %[[SIZE:.+]]: index
-func.func @CanonicalizeResourcePackIntervals(%size : index) -> (index, index, index) {
+util.func private @CanonicalizeResourcePackIntervals(%size : index) -> (index, index, index) {
   // CHECK-NEXT: %[[PACKED:.+]]:3 =
   %total_length, %offset_0, %offset_1 =
       // CHECK-SAME: stream.resource.pack slices({
@@ -164,25 +164,25 @@
           [1, 2] = %size,
           [0, 4] = %size,
         }) : index
-  // CHECK: return %[[PACKED]]#0, %[[PACKED]]#2, %[[PACKED]]#1
-  return %total_length, %offset_0, %offset_1 : index, index, index
+  // CHECK: util.return %[[PACKED]]#0, %[[PACKED]]#2, %[[PACKED]]#1
+  util.return %total_length, %offset_0, %offset_1 : index, index, index
 }
 
 // -----
 
 // CHECK-LABEL: @FoldResourceSubviewOp
-func.func @FoldResourceSubviewOp(%arg0: !stream.resource<*>, %arg1: index) -> !stream.resource<*> {
+util.func private @FoldResourceSubviewOp(%arg0: !stream.resource<*>, %arg1: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   // CHECK-NOT: stream.resource.subview
   %0 = stream.resource.subview %arg0[%c0] : !stream.resource<*>{%arg1} -> !stream.resource<*>{%arg1}
-  // CHECK: return %arg0
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %arg0
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @FoldResourceSubviewOps
-func.func @FoldResourceSubviewOps(%arg0: !stream.resource<*>, %arg1: index) -> !stream.resource<*> {
+util.func private @FoldResourceSubviewOps(%arg0: !stream.resource<*>, %arg1: index) -> !stream.resource<*> {
   %c100 = arith.constant 100 : index
   %c300 = arith.constant 300 : index
   %c400 = arith.constant 400 : index
@@ -191,14 +191,14 @@
   %0 = stream.resource.subview %arg0[%c100] : !stream.resource<*>{%arg1} -> !stream.resource<*>{%c500}
   %1 = stream.resource.subview %0[%c100] : !stream.resource<*>{%c500} -> !stream.resource<*>{%c400}
   %2 = stream.resource.subview %1[%c100] : !stream.resource<*>{%c400} -> !stream.resource<*>{%c300}
-  // CHECK-NEXT: return %[[RET]]
-  return %2 : !stream.resource<*>
+  // CHECK-NEXT: util.return %[[RET]]
+  util.return %2 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @SinkSubviewAcrossSelectOps
-func.func @SinkSubviewAcrossSelectOps(%arg0: !stream.resource<*>, %arg1: i1) -> !stream.resource<*> {
+util.func private @SinkSubviewAcrossSelectOps(%arg0: !stream.resource<*>, %arg1: i1) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   %c256 = arith.constant 256 : index
@@ -209,8 +209,8 @@
   // CHECK: %[[OFFSET:.+]] = arith.select %arg1, %c0, %c128 : index
   %2 = arith.select %arg1, %0, %1 : !stream.resource<*>
   // CHECK-NEXT: %[[SUBVIEW:.+]] = stream.resource.subview %arg0[%[[OFFSET]]] : !stream.resource<*>{%c256} -> !stream.resource<*>{%c128}
-  // CHECK-NEXT: return %[[SUBVIEW]]
-  return %2 : !stream.resource<*>
+  // CHECK-NEXT: util.return %[[SUBVIEW]]
+  util.return %2 : !stream.resource<*>
 }
 
 // -----
@@ -219,9 +219,9 @@
 
 // CHECK-LABEL: unrealizedCastCleanup
 // CHECK-SAME: (%[[ARG0:.+]]: !stream.resource<transient>, %[[ARG1:.+]]: index)
-func.func @unrealizedCastCleanup(%arg0: !stream.resource<transient>, %arg1: index) -> (!stream.resource<transient>, index) {
+util.func private @unrealizedCastCleanup(%arg0: !stream.resource<transient>, %arg1: index) -> (!stream.resource<transient>, index) {
   %0 = builtin.unrealized_conversion_cast %arg0, %arg1 : !stream.resource<transient>, index to !stream.resource<transient>
   %1 = stream.resource.size %0 : !stream.resource<transient>
-  // CHECK-NEXT: return %[[ARG0]], %[[ARG1]]
-  return %0, %1 : !stream.resource<transient>, index
+  // CHECK-NEXT: util.return %[[ARG0]], %[[ARG1]]
+  util.return %0, %1 : !stream.resource<transient>, index
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/test/resource_ops.mlir b/compiler/src/iree/compiler/Dialect/Stream/IR/test/resource_ops.mlir
index f19f530..6121ee3 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/test/resource_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/test/resource_ops.mlir

@@ -1,79 +1,79 @@
 // RUN: iree-opt --split-input-file %s | iree-opt --split-input-file | FileCheck %s
 
 // CHECK-LABEL: @resourceAlloc
-func.func @resourceAlloc(%arg0: index) -> !stream.resource<*> {
+util.func private @resourceAlloc(%arg0: index) -> !stream.resource<*> {
   // CHECK: = stream.resource.alloc uninitialized : !stream.resource<*>{%arg0}
   %0 = stream.resource.alloc uninitialized : !stream.resource<*>{%arg0}
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @resourceAlloca
-func.func @resourceAlloca(%arg0: index, %await_timepoint: !stream.timepoint) -> (!stream.resource<staging>, !stream.timepoint, !stream.resource<staging>, !stream.timepoint) {
+util.func private @resourceAlloca(%arg0: index, %await_timepoint: !stream.timepoint) -> (!stream.resource<staging>, !stream.timepoint, !stream.resource<staging>, !stream.timepoint) {
   // CHECK: = stream.resource.alloca uninitialized : !stream.resource<staging>{%arg0} => !stream.timepoint
   %0:2 = stream.resource.alloca uninitialized : !stream.resource<staging>{%arg0} => !stream.timepoint
   // CHECK: = stream.resource.alloca uninitialized await(%arg1) => !stream.resource<staging>{%arg0} => !stream.timepoint
   %1:2 = stream.resource.alloca uninitialized await(%await_timepoint) => !stream.resource<staging>{%arg0} => !stream.timepoint
-  return %0#0, %0#1, %1#0, %1#1 : !stream.resource<staging>, !stream.timepoint, !stream.resource<staging>, !stream.timepoint
+  util.return %0#0, %0#1, %1#0, %1#1 : !stream.resource<staging>, !stream.timepoint, !stream.resource<staging>, !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @resourceDealloca
-func.func @resourceDealloca(%arg0: index, %arg1: !stream.resource<staging>, %arg2: !stream.timepoint) {
+util.func private @resourceDealloca(%arg0: index, %arg1: !stream.resource<staging>, %arg2: !stream.timepoint) {
   // CHECK: = stream.resource.dealloca %arg1 : !stream.resource<staging>{%arg0} => !stream.timepoint
   stream.resource.dealloca %arg1 : !stream.resource<staging>{%arg0} => !stream.timepoint
   // CHECK: = stream.resource.dealloca await(%arg2) => %arg1 : !stream.resource<staging>{%arg0} => !stream.timepoint
   stream.resource.dealloca await(%arg2) => %arg1 : !stream.resource<staging>{%arg0} => !stream.timepoint
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @resourceSize
-func.func @resourceSize(%arg0: !stream.resource<*>) -> index {
+util.func private @resourceSize(%arg0: !stream.resource<*>) -> index {
   // CHECK: = stream.resource.size %arg0 : !stream.resource<*>
   %0 = stream.resource.size %arg0 : !stream.resource<*>
-  return %0 : index
+  util.return %0 : index
 }
 
 // -----
 
 // CHECK-LABEL: @resourceTryMap
-func.func @resourceTryMap(%arg0: !util.buffer) -> (i1, !stream.resource<constant>) {
+util.func private @resourceTryMap(%arg0: !util.buffer) -> (i1, !stream.resource<constant>) {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   // CHECK: = stream.resource.try_map %arg0[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128}
   %0:2 = stream.resource.try_map %arg0[%c0] : !util.buffer -> i1, !stream.resource<constant>{%c128}
-  return %0#0, %0#1 : i1, !stream.resource<constant>
+  util.return %0#0, %0#1 : i1, !stream.resource<constant>
 }
 
 // -----
 
 // CHECK-LABEL: @resourceLoad
-func.func @resourceLoad(%arg0: !stream.resource<staging>, %arg1: index) -> i32 {
+util.func private @resourceLoad(%arg0: !stream.resource<staging>, %arg1: index) -> i32 {
   %c0 = arith.constant 0 : index
   // CHECK: = stream.resource.load %arg0[%c0] : !stream.resource<staging>{%arg1} -> i32
   %0 = stream.resource.load %arg0[%c0] : !stream.resource<staging>{%arg1} -> i32
-  return %0 : i32
+  util.return %0 : i32
 }
 
 // -----
 
 // CHECK-LABEL: @resourceStore
-func.func @resourceStore(%arg0: !stream.resource<staging>, %arg1: index) {
+util.func private @resourceStore(%arg0: !stream.resource<staging>, %arg1: index) {
   %c0 = arith.constant 0 : index
   %c123_i32 = arith.constant 123 : i32
   // CHECK: stream.resource.store %c123_i32, %arg0[%c0] : i32 -> !stream.resource<staging>{%arg1}
   stream.resource.store %c123_i32, %arg0[%c0] : i32 -> !stream.resource<staging>{%arg1}
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @resourcePack
-func.func @resourcePack(%arg0: index, %arg1: index) -> (index, index, index) {
+util.func private @resourcePack(%arg0: index, %arg1: index) -> (index, index, index) {
   %c128 = arith.constant 128 : index
   //      CHECK: stream.resource.pack offset(%c128) slices({
   // CHECK-NEXT:   [0, 9] = %arg0,
@@ -83,13 +83,13 @@
     [0, 9] = %arg0,
     [3, 8] = %arg1,
   }) : index
-  return %0#0, %0#1, %0#2 : index, index, index
+  util.return %0#0, %0#1, %0#2 : index, index, index
 }
 
 // -----
 
 // CHECK-LABEL: @resourceConstants
-func.func @resourceConstants() -> (!stream.resource<constant>, !stream.resource<constant>, !stream.timepoint) {
+util.func private @resourceConstants() -> (!stream.resource<constant>, !stream.resource<constant>, !stream.timepoint) {
   %c4 = arith.constant 4 : index
   %c8 = arith.constant 8 : index
   //      CHECK: = stream.resource.constants :
@@ -100,16 +100,16 @@
     !stream.resource<constant>{%c4} = dense<100> : tensor<1xi32>,
     !stream.resource<constant>{%c8} = dense<[101, 102]> : tensor<2xi32>
     => !stream.timepoint
-  return %0#0, %0#1, %0#2 : !stream.resource<constant>, !stream.resource<constant>, !stream.timepoint
+  util.return %0#0, %0#1, %0#2 : !stream.resource<constant>, !stream.resource<constant>, !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @resourceSubview
-func.func @resourceSubview(%arg0: !stream.resource<*>, %arg1: index) -> !stream.resource<*> {
+util.func private @resourceSubview(%arg0: !stream.resource<*>, %arg1: index) -> !stream.resource<*> {
   %c128 = arith.constant 128 : index
   %c256 = arith.constant 256 : index
   // CHECK: = stream.resource.subview %arg0[%c128] : !stream.resource<*>{%arg1} -> !stream.resource<*>{%c256}
   %0 = stream.resource.subview %arg0[%c128] : !stream.resource<*>{%arg1} -> !stream.resource<*>{%c256}
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/test/tensor_folding.mlir b/compiler/src/iree/compiler/Dialect/Stream/IR/test/tensor_folding.mlir
index 1d6aadf..08ad4a1 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/test/tensor_folding.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/test/tensor_folding.mlir

@@ -1,201 +1,201 @@
 // RUN: iree-opt --split-input-file --canonicalize %s | iree-opt --split-input-file | FileCheck %s
 
 // CHECK-LABEL: @FoldTensorImportOp
-func.func @FoldTensorImportOp(%arg0: !stream.resource<external>, %arg1: index) -> !stream.resource<external> {
+util.func private @FoldTensorImportOp(%arg0: !stream.resource<external>, %arg1: index) -> !stream.resource<external> {
   // CHECK-NOT: stream.tensor.import
   // CHECK-NOT: stream.tensor.export
-  // CHECK: return %arg0 : !stream.resource<external>
+  // CHECK: util.return %arg0 : !stream.resource<external>
   %c20 = arith.constant 20 : index
   %0 = stream.tensor.export %arg0 : tensor<?x5xf32>{%arg1} in !stream.resource<external>{%c20} -> !hal.buffer_view
   %1 = stream.tensor.import %0 : !hal.buffer_view -> tensor<1x?x5xf32>{%arg1} in !stream.resource<external>{%c20}
-  return %1 : !stream.resource<external>
+  util.return %1 : !stream.resource<external>
 }
 
 // -----
 
 // CHECK-LABEL: @FoldTensorExportOp
-func.func @FoldTensorExportOp(%arg0: !hal.buffer_view, %arg1: index) -> !hal.buffer_view {
+util.func private @FoldTensorExportOp(%arg0: !hal.buffer_view, %arg1: index) -> !hal.buffer_view {
   // CHECK-NOT: stream.tensor.import
   // CHECK-NOT: stream.tensor.export
-  // CHECK: return %arg0 : !hal.buffer_view
+  // CHECK: util.return %arg0 : !hal.buffer_view
   %c20 = arith.constant 20 : index
   %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x5xf32>{%arg1} in !stream.resource<external>{%c20}
   %1 = stream.tensor.export %0 : tensor<?x5xf32>{%arg1} in !stream.resource<external>{%c20} -> !hal.buffer_view
-  return %1 : !hal.buffer_view
+  util.return %1 : !hal.buffer_view
 }
 
 // -----
 
 // CHECK-LABEL: @NofoldTensorExportOpBufferToView
-func.func @NofoldTensorExportOpBufferToView(%arg0: !hal.buffer, %arg1: index) -> !hal.buffer_view {
+util.func private @NofoldTensorExportOpBufferToView(%arg0: !hal.buffer, %arg1: index) -> !hal.buffer_view {
   // CHECK: %[[IMPORT:.+]] = stream.tensor.import
   // CHECK: %[[EXPORT:.+]] = stream.tensor.export %[[IMPORT]]
-  // CHECK: return %[[EXPORT]] : !hal.buffer_view
+  // CHECK: util.return %[[EXPORT]] : !hal.buffer_view
   %c20 = arith.constant 20 : index
   %0 = stream.tensor.import %arg0 : !hal.buffer -> tensor<?x5xf32>{%arg1} in !stream.resource<external>{%c20}
   %1 = stream.tensor.export %0 : tensor<?x5xf32>{%arg1} in !stream.resource<external>{%c20} -> !hal.buffer_view
-  return %1 : !hal.buffer_view
+  util.return %1 : !hal.buffer_view
 }
 
 // -----
 
 // CHECK-LABEL: @KeepTensorExportOpWithDifferingEncodings
-func.func @KeepTensorExportOpWithDifferingEncodings(%arg0: !hal.buffer_view, %arg1: index) -> !hal.buffer_view {
+util.func private @KeepTensorExportOpWithDifferingEncodings(%arg0: !hal.buffer_view, %arg1: index) -> !hal.buffer_view {
   // CHECK: %[[IMPORT:.+]] = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x5xf32>{%arg1} in !stream.resource<external>{%c20}
   // CHECK: %[[EXPORT:.+]] = stream.tensor.export %[[IMPORT]] : tensor<1x?x5xf32>{%arg1} in !stream.resource<external>{%c20} -> !hal.buffer_view
-  // CHECK: return %[[EXPORT]] : !hal.buffer_view
+  // CHECK: util.return %[[EXPORT]] : !hal.buffer_view
   %c20 = arith.constant 20 : index
   %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x5xf32>{%arg1} in !stream.resource<external>{%c20}
   %1 = stream.tensor.export %0 : tensor<1x?x5xf32>{%arg1} in !stream.resource<external>{%c20} -> !hal.buffer_view
-  return %1 : !hal.buffer_view
+  util.return %1 : !hal.buffer_view
 }
 
 // -----
 
 // CHECK-LABEL: @TensorConstantToEmpty
-func.func @TensorConstantToEmpty(%arg0: index) -> !stream.resource<constant> {
+util.func private @TensorConstantToEmpty(%arg0: index) -> !stream.resource<constant> {
   // CHECK: %[[EMPTY:.+]] = stream.tensor.empty : tensor<2x0x?xf32>{%arg0} in !stream.resource<constant>
-  // CHECK: return %[[EMPTY]]
+  // CHECK: util.return %[[EMPTY]]
   // CHECK-NOT: stream.tensor.constant
   %cst = stream.tensor.constant : tensor<2x0x?xf32>{%arg0} in !stream.resource<constant> = dense<> : tensor<2x0x4xf32>
-  return %cst : !stream.resource<constant>
+  util.return %cst : !stream.resource<constant>
 }
 
 // -----
 
 // CHECK-LABEL: @TensorConstantToEmptyDynamic
-func.func @TensorConstantToEmptyDynamic() -> !stream.resource<constant> {
+util.func private @TensorConstantToEmptyDynamic() -> !stream.resource<constant> {
   // CHECK: %[[EMPTY:.+]] = stream.tensor.empty : tensor<2x?xf32>{%c0} in !stream.resource<constant>
-  // CHECK: return %[[EMPTY]]
+  // CHECK: util.return %[[EMPTY]]
   // CHECK-NOT: stream.tensor.constant
   %c0 = arith.constant 0 : index
   %cst = stream.tensor.constant : tensor<2x?xf32>{%c0} in !stream.resource<constant> = dense<> : tensor<2x0xf32>
-  return %cst : !stream.resource<constant>
+  util.return %cst : !stream.resource<constant>
 }
 
 // -----
 
 // CHECK-LABEL: @TensorConstantToSplat
-func.func @TensorConstantToSplat() -> !stream.resource<constant> {
+util.func private @TensorConstantToSplat() -> !stream.resource<constant> {
   // CHECK-DAG: %[[CST:.+]] = arith.constant 1.000000e+00 : f32
   // CHECK-DAG: %[[SIZE:.+]] = stream.tensor.sizeof tensor<2x2xf32> : index
   // CHECK: = stream.tensor.splat %[[CST]] : f32 -> tensor<2x2xf32> in !stream.resource<*>{%[[SIZE]]}
   %cst = stream.tensor.constant : tensor<2x2xf32> in !stream.resource<constant> = dense<1.000000e+00> : tensor<2x2xf32>
-  return %cst : !stream.resource<constant>
+  util.return %cst : !stream.resource<constant>
 }
 
 // -----
 
 // CHECK-LABEL: @TensorComplexConstantToSplat
-func.func @TensorComplexConstantToSplat() -> !stream.resource<constant> {
+util.func private @TensorComplexConstantToSplat() -> !stream.resource<constant> {
   // CHECK-DAG: %[[CST:.+]] = complex.constant [2.000000e+00 : f32, 3.000000e+00 : f32] : complex<f32>
   // CHECK-DAG: %[[SIZE:.+]] = stream.tensor.sizeof tensor<2x2xcomplex<f32>> : index
   // CHECK: = stream.tensor.splat %[[CST]] : complex<f32> -> tensor<2x2xcomplex<f32>> in !stream.resource<*>{%[[SIZE]]}
   %cst = stream.tensor.constant : tensor<2x2xcomplex<f32>> in !stream.resource<constant> = dense<(2.000000e+00,3.000000e+00)> : tensor<2x2xcomplex<f32>>
-  return %cst : !stream.resource<constant>
+  util.return %cst : !stream.resource<constant>
 }
 
 // -----
 
 // CHECK-LABEL: @NarrowSplatPatternI32ToI8
-func.func @NarrowSplatPatternI32ToI8() -> !stream.resource<*> {
+util.func private @NarrowSplatPatternI32ToI8() -> !stream.resource<*> {
   %c100 = arith.constant 100 : index
   %pattern = arith.constant 0xAAAAAAAA : i32
   // CHECK: stream.tensor.splat %c-86_i8 : i8
   %0 = stream.tensor.splat %pattern : i32 -> tensor<2x2xf32> in !stream.resource<*>{%c100}
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @NarrowSplatPatternI32ToI16
-func.func @NarrowSplatPatternI32ToI16() -> !stream.resource<*> {
+util.func private @NarrowSplatPatternI32ToI16() -> !stream.resource<*> {
   %c100 = arith.constant 100 : index
   %pattern = arith.constant 0xAABBAABB : i32
   // CHECK: stream.tensor.splat %c-21829_i16 : i16
   %0 = stream.tensor.splat %pattern : i32 -> tensor<2x2xf32> in !stream.resource<*>{%c100}
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @NarrowSplatPatternI64ToI8
-func.func @NarrowSplatPatternI64ToI8() -> !stream.resource<*> {
+util.func private @NarrowSplatPatternI64ToI8() -> !stream.resource<*> {
   %c100 = arith.constant 100 : index
   %pattern = arith.constant 0 : i64
   // CHECK: stream.tensor.splat %c0_i8 : i8
   %0 = stream.tensor.splat %pattern : i64 -> tensor<2x2xf32> in !stream.resource<*>{%c100}
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @NarrowSplatPatternI64ToI16
-func.func @NarrowSplatPatternI64ToI16() -> !stream.resource<*> {
+util.func private @NarrowSplatPatternI64ToI16() -> !stream.resource<*> {
   %c100 = arith.constant 100 : index
   %pattern = arith.constant 0xAABBAABBAABBAABB : i64
   // CHECK: stream.tensor.splat %c-21829_i16 : i16
   %0 = stream.tensor.splat %pattern : i64 -> tensor<2x2xf32> in !stream.resource<*>{%c100}
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @NarrowSplatPatternI64ToI32
-func.func @NarrowSplatPatternI64ToI32() -> !stream.resource<*> {
+util.func private @NarrowSplatPatternI64ToI32() -> !stream.resource<*> {
   %c100 = arith.constant 100 : index
   %pattern = arith.constant 0xAABBCCDDAABBCCDD : i64
   // CHECK: stream.tensor.splat %c12307677_i32
   %0 = stream.tensor.splat %pattern : i64 -> tensor<2x2xf32> in !stream.resource<*>{%c100}
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @NarrowSplatPatternBF16
-func.func @NarrowSplatPatternBF16() -> !stream.resource<*> {
+util.func private @NarrowSplatPatternBF16() -> !stream.resource<*> {
   %c100 = arith.constant 100 : index
   %pattern = arith.constant 0.0 : bf16
   // CHECK: stream.tensor.splat %c0_i8 : i8
   %0 = stream.tensor.splat %pattern : bf16 -> tensor<2x2xf32> in !stream.resource<*>{%c100}
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @NarrowSplatPatternF32
-func.func @NarrowSplatPatternF32() -> !stream.resource<*> {
+util.func private @NarrowSplatPatternF32() -> !stream.resource<*> {
   %c100 = arith.constant 100 : index
   %pattern = arith.constant 0.0 : f32
   // CHECK: stream.tensor.splat %c0_i8 : i8
   %0 = stream.tensor.splat %pattern : f32 -> tensor<2x2xf32> in !stream.resource<*>{%c100}
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @FoldTensorCloneOp
-func.func @FoldTensorCloneOp(%arg0: !stream.resource<*>, %arg1: index) -> !stream.resource<*> {
+util.func private @FoldTensorCloneOp(%arg0: !stream.resource<*>, %arg1: index) -> !stream.resource<*> {
   // CHECK-NOT: stream.tensor.clone
   %0 = stream.tensor.clone %arg0 : tensor<2x2xf32> in !stream.resource<*>{%arg1} -> tensor<2x2xf32> in !stream.resource<*>{%arg1}
-  // CHECK: return %arg0
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %arg0
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @NofoldTensorCloneOp
-func.func @NofoldTensorCloneOp(%arg0: !stream.resource<external>, %arg1: index) -> !stream.resource<*> {
+util.func private @NofoldTensorCloneOp(%arg0: !stream.resource<external>, %arg1: index) -> !stream.resource<*> {
   // CHECK: %[[CLONE:.+]] = stream.tensor.clone
   %0 = stream.tensor.clone %arg0 : tensor<2x2xf32> in !stream.resource<external>{%arg1} -> tensor<2x2xf32> in !stream.resource<*>{%arg1}
-  // CHECK: return %[[CLONE]] : !stream.resource<*>
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %[[CLONE]] : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @ElideUnneededTensorClones
-func.func @ElideUnneededTensorClones(%arg0: !stream.resource<*>, %arg1: index) -> f32 {
+util.func private @ElideUnneededTensorClones(%arg0: !stream.resource<*>, %arg1: index) -> f32 {
   %c0 = arith.constant 0 : index
   // CHECK-NOT: stream.tensor.clone
   %0 = stream.tensor.clone %arg0 : tensor<2x2xf32> in !stream.resource<*>{%arg1} -> tensor<2x2xf32> in !stream.resource<*>{%arg1}
@@ -203,6 +203,6 @@
   %1 = stream.async.transfer %0 : !stream.resource<*>{%arg1} -> !stream.resource<staging>{%arg1}
   // CHECK: %[[T1:.+]] = stream.tensor.load %[[T0]][%c0, %c0] : tensor<2x2xf32> in !stream.resource<staging>{%arg1} -> f32
   %2 = stream.tensor.load %1[%c0, %c0] : tensor<2x2xf32> in !stream.resource<staging>{%arg1} -> f32
-  // CHECK: return %[[T1]]
-  return %2 : f32
+  // CHECK: util.return %[[T1]]
+  util.return %2 : f32
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/test/tensor_ops.mlir b/compiler/src/iree/compiler/Dialect/Stream/IR/test/tensor_ops.mlir
index 4cc586c..a224e4a 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/test/tensor_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/test/tensor_ops.mlir

@@ -1,146 +1,146 @@
 // RUN: iree-opt --split-input-file %s | iree-opt --split-input-file | FileCheck %s
 
 // CHECK-LABEL: @tensorImport
-func.func @tensorImport(%arg0: !hal.buffer_view, %arg1: index) -> !stream.resource<external> {
+util.func private @tensorImport(%arg0: !hal.buffer_view, %arg1: index) -> !stream.resource<external> {
   %c20 = arith.constant 20 : index
   // CHECK: = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x5xf32>{%arg1} in !stream.resource<external>{%c20}
   %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<?x5xf32>{%arg1} in !stream.resource<external>{%c20}
-  return %0 : !stream.resource<external>
+  util.return %0 : !stream.resource<external>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorExport
-func.func @tensorExport(%arg0: !stream.resource<external>, %arg1: index) -> !hal.buffer_view {
+util.func private @tensorExport(%arg0: !stream.resource<external>, %arg1: index) -> !hal.buffer_view {
   %c200 = arith.constant 200 : index
   // CHECK: = stream.tensor.export %arg0 : tensor<?x1x10xf32>{%arg1} in !stream.resource<external>{%c200} -> !hal.buffer_view
   %0 = stream.tensor.export %arg0 : tensor<?x1x10xf32>{%arg1} in !stream.resource<external>{%c200} -> !hal.buffer_view
-  return %0 : !hal.buffer_view
+  util.return %0 : !hal.buffer_view
 }
 
 // -----
 
 // CHECK-LABEL: @tensorSizeOf
-func.func @tensorSizeOf(%arg0: index) -> index {
+util.func private @tensorSizeOf(%arg0: index) -> index {
   // CHECK: = stream.tensor.sizeof tensor<?x5xf32>{%arg0} : index
   %0 = stream.tensor.sizeof tensor<?x5xf32>{%arg0} : index
-  return %0 : index
+  util.return %0 : index
 }
 
 // -----
 
 // CHECK-LABEL: @tensorEmpty
-func.func @tensorEmpty(%arg0: index, %arg1: index) -> !stream.resource<*> {
+util.func private @tensorEmpty(%arg0: index, %arg1: index) -> !stream.resource<*> {
   // CHECK: = stream.tensor.empty : tensor<?x0xf32>{%arg0} in !stream.resource<*>{%arg1}
   %0 = stream.tensor.empty : tensor<?x0xf32>{%arg0} in !stream.resource<*>{%arg1}
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorConstant
-func.func @tensorConstant(%arg0: index) -> !stream.resource<constant> {
+util.func private @tensorConstant(%arg0: index) -> !stream.resource<constant> {
   // CHECK: = stream.tensor.constant : tensor<?x5x64xf32>{%arg0} in !stream.resource<constant> = dense<0.000000e+00> : tensor<1x5x64xf32>
   %0 = stream.tensor.constant : tensor<?x5x64xf32>{%arg0} in !stream.resource<constant> = dense<0.000000e+00> : tensor<1x5x64xf32>
-  return %0 : !stream.resource<constant>
+  util.return %0 : !stream.resource<constant>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorSplat
-func.func @tensorSplat(%arg0: f32, %arg1: index, %arg2: index) -> !stream.resource<*> {
+util.func private @tensorSplat(%arg0: f32, %arg1: index, %arg2: index) -> !stream.resource<*> {
   // CHECK: = stream.tensor.splat %arg0 : f32 -> tensor<?x1x10xf32>{%arg1} in !stream.resource<*>{%arg2}
   %0 = stream.tensor.splat %arg0 : f32 -> tensor<?x1x10xf32>{%arg1} in !stream.resource<*>{%arg2}
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorClone
-func.func @tensorClone(%arg0: !stream.resource<*>, %arg1: index, %arg2: index) -> !stream.resource<*> {
+util.func private @tensorClone(%arg0: !stream.resource<*>, %arg1: index, %arg2: index) -> !stream.resource<*> {
   // CHECK: = stream.tensor.clone %arg0 : tensor<?x4xf32>{%arg1} in !stream.resource<*>{%arg2} -> tensor<?x4xf32>{%arg1} in !stream.resource<*>{%arg2}
   %0 = stream.tensor.clone %arg0 : tensor<?x4xf32>{%arg1} in !stream.resource<*>{%arg2} -> tensor<?x4xf32>{%arg1} in !stream.resource<*>{%arg2}
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorSlice
-func.func @tensorSlice(%arg0: !stream.resource<*>, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> !stream.resource<*> {
+util.func private @tensorSlice(%arg0: !stream.resource<*>, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   // CHECK: = stream.tensor.slice %arg0[%c0, %c1 for %arg3, %c1] : tensor<?x4xf32>{%arg1} in !stream.resource<*>{%arg2} -> tensor<?x1xf32>{%arg3} in !stream.resource<*>{%arg4}
   %0 = stream.tensor.slice %arg0[%c0, %c1 for %arg3, %c1] : tensor<?x4xf32>{%arg1} in !stream.resource<*>{%arg2} -> tensor<?x1xf32>{%arg3} in !stream.resource<*>{%arg4}
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorFill
-func.func @tensorFill(%arg0: f32, %arg1: !stream.resource<*>, %arg2: index, %arg3: index) -> !stream.resource<*> {
+util.func private @tensorFill(%arg0: f32, %arg1: !stream.resource<*>, %arg2: index, %arg3: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   // CHECK: = stream.tensor.fill %arg0, %arg1[%c0, %c0 for %c1, %c1] : f32 -> tensor<?x4xf32>{%arg2} in %arg1 as !stream.resource<*>{%arg3}
   %0 = stream.tensor.fill %arg0, %arg1[%c0, %c0 for %c1, %c1] : f32 -> tensor<?x4xf32>{%arg2} in %arg1 as !stream.resource<*>{%arg3}
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorUpdate
-func.func @tensorUpdate(%arg0: !stream.resource<*>, %arg1: index, %arg2: !stream.resource<*>, %arg3: index, %arg4: index) -> !stream.resource<*> {
+util.func private @tensorUpdate(%arg0: !stream.resource<*>, %arg1: index, %arg2: !stream.resource<*>, %arg3: index, %arg4: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   // CHECK: = stream.tensor.update %arg0, %arg2[%c0, %c0] : tensor<2x2xf32> in !stream.resource<*>{%arg1} -> tensor<?x4xf32>{%arg3} in %arg2 as !stream.resource<*>{%arg4}
   %0 = stream.tensor.update %arg0, %arg2[%c0, %c0] : tensor<2x2xf32> in !stream.resource<*>{%arg1} -> tensor<?x4xf32>{%arg3} in %arg2 as !stream.resource<*>{%arg4}
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorLoad
-func.func @tensorLoad(%arg0: !stream.resource<staging>, %arg1: index, %arg2: index) -> f32 {
+util.func private @tensorLoad(%arg0: !stream.resource<staging>, %arg1: index, %arg2: index) -> f32 {
   %c0 = arith.constant 0 : index
   // CHECK: = stream.tensor.load %arg0[%c0] : tensor<?xf32>{%arg1} in !stream.resource<staging>{%arg2} -> f32
   %0 = stream.tensor.load %arg0[%c0] : tensor<?xf32>{%arg1} in !stream.resource<staging>{%arg2} -> f32
-  return %0 : f32
+  util.return %0 : f32
 }
 
 // -----
 
 // CHECK-LABEL: @tensorLoadRank0
-func.func @tensorLoadRank0(%arg0: !stream.resource<staging>, %arg1: index) -> f32 {
+util.func private @tensorLoadRank0(%arg0: !stream.resource<staging>, %arg1: index) -> f32 {
   %c0 = arith.constant 0 : index
   // CHECK: = stream.tensor.load %arg0 : tensor<f32> in !stream.resource<staging>{%arg1} -> f32
   %0 = stream.tensor.load %arg0 : tensor<f32> in !stream.resource<staging>{%arg1} -> f32
-  return %0 : f32
+  util.return %0 : f32
 }
 
 // -----
 
 // CHECK-LABEL: @tensorStore
-func.func @tensorStore(%arg0: !stream.resource<staging>, %arg1: index, %arg2: index, %arg3: f32) -> !stream.resource<staging> {
+util.func private @tensorStore(%arg0: !stream.resource<staging>, %arg1: index, %arg2: index, %arg3: f32) -> !stream.resource<staging> {
   %c0 = arith.constant 0 : index
   // CHECK: = stream.tensor.store %arg3, %arg0[%c0] : f32 -> tensor<?xf32>{%arg1} in %arg0 as !stream.resource<staging>{%arg2}
   %0 = stream.tensor.store %arg3, %arg0[%c0] : f32 -> tensor<?xf32>{%arg1} in %arg0 as !stream.resource<staging>{%arg2}
-  return %0 : !stream.resource<staging>
+  util.return %0 : !stream.resource<staging>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorStoreRank0
-func.func @tensorStoreRank0(%arg0: !stream.resource<staging>, %arg1: index, %arg2: f32) -> !stream.resource<staging> {
+util.func private @tensorStoreRank0(%arg0: !stream.resource<staging>, %arg1: index, %arg2: f32) -> !stream.resource<staging> {
   %c0 = arith.constant 0 : index
   // CHECK: = stream.tensor.store %arg2, %arg0 : f32 -> tensor<f32> in %arg0 as !stream.resource<staging>{%arg1}
   %0 = stream.tensor.store %arg2, %arg0 : f32 -> tensor<f32> in %arg0 as !stream.resource<staging>{%arg1}
-  return %0 : !stream.resource<staging>
+  util.return %0 : !stream.resource<staging>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorTrace
 //  CHECK-SAME: (%[[TENSOR0:.+]]: !stream.resource<staging>, %[[TENSOR0_SIZE:.+]]: index, %[[TENSOR1:.+]]: !stream.resource<staging>, %[[TENSOR1_SIZE:.+]]: index, %[[TENSOR1_DIM0:.+]]: index, %[[TENSOR1_DIM2:.+]]: index)
-func.func @tensorTrace(%tensor0: !stream.resource<staging>, %tensor0_size: index, %tensor1: !stream.resource<staging>, %tensor1_size: index, %tensor1_dim0: index, %tensor1_dim2: index) {
+util.func private @tensorTrace(%tensor0: !stream.resource<staging>, %tensor0_size: index, %tensor1: !stream.resource<staging>, %tensor1_size: index, %tensor1_dim0: index, %tensor1_dim2: index) {
   //      CHECK: stream.tensor.trace "FOOBAR" = [
   // CHECK-NEXT:   %[[TENSOR0]] : tensor<5xf32> in !stream.resource<staging>{%[[TENSOR0_SIZE]]},
   // CHECK-NEXT:   %[[TENSOR1]] : tensor<?x3x?xi32>{%[[TENSOR1_DIM0]], %[[TENSOR1_DIM2]]} in !stream.resource<staging>{%[[TENSOR1_SIZE]]}
@@ -149,5 +149,5 @@
     %tensor0 : tensor<5xf32> in !stream.resource<staging>{%tensor0_size},
     %tensor1 : tensor<?x3x?xi32>{%tensor1_dim0, %tensor1_dim2} in !stream.resource<staging>{%tensor1_size}
   ]
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/test/timepoint_folding.mlir b/compiler/src/iree/compiler/Dialect/Stream/IR/test/timepoint_folding.mlir
index 3fa662b..14ed05a 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/test/timepoint_folding.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/test/timepoint_folding.mlir

@@ -1,39 +1,39 @@
 // RUN: iree-opt --split-input-file --canonicalize %s | FileCheck %s
 
 // CHECK-LABEL: @FoldTimepointExport
-func.func @FoldTimepointExport(%arg0: !hal.semaphore, %arg1: index) -> (!hal.semaphore, index) {
+util.func private @FoldTimepointExport(%arg0: !hal.semaphore, %arg1: index) -> (!hal.semaphore, index) {
   // CHECK-NOT: stream.timepoint.import
   %0 = stream.timepoint.import %arg0, %arg1 : (!hal.semaphore, index) => !stream.timepoint
   // CHECK-NOT: stream.timepoint.export
   %1:2 = stream.timepoint.export %0 => (!hal.semaphore, index)
-  // CHECK: return %arg0, %arg1
-  return %1#0, %1#1 : !hal.semaphore, index
+  // CHECK: util.return %arg0, %arg1
+  util.return %1#0, %1#1 : !hal.semaphore, index
 }
 
 // -----
 
 // CHECK-LABEL: @DontFoldTimepointExportMismatch
-func.func @DontFoldTimepointExportMismatch(%arg0: !hal.semaphore, %arg1: index) -> (!hal.semaphore, i32) {
+util.func private @DontFoldTimepointExportMismatch(%arg0: !hal.semaphore, %arg1: index) -> (!hal.semaphore, i32) {
   // CHECK: stream.timepoint.import
   %0 = stream.timepoint.import %arg0, %arg1 : (!hal.semaphore, index) => !stream.timepoint
   // CHECK-NEXT: stream.timepoint.export
   %1:2 = stream.timepoint.export %0 => (!hal.semaphore, i32)
-  return %1#0, %1#1 : !hal.semaphore, i32
+  util.return %1#0, %1#1 : !hal.semaphore, i32
 }
 
 // -----
 
 // CHECK-LABEL: @PassThroughChainExternal
 // CHECK-SAME: (%[[DEVICE:.+]]: !hal.device, %[[ARG_FENCE:.+]]: !hal.fence)
-func.func @PassThroughChainExternal(%device: !hal.device, %arg_fence: !hal.fence) -> !hal.fence {
+util.func private @PassThroughChainExternal(%device: !hal.device, %arg_fence: !hal.fence) -> !hal.fence {
   // CHECK-NOT: stream.timepoint.import
   %timepoint = stream.timepoint.import %arg_fence : (!hal.fence) => !stream.timepoint
   // CHECK-NOT: hal.fence.create
   %chained_fence = hal.fence.create device(%device : !hal.device) flags("None") : !hal.fence
   // CHECK-NOT: stream.timepoint.chain_external
   stream.timepoint.chain_external %timepoint => (%chained_fence : !hal.fence)
-  // CHECK: return %[[ARG_FENCE]]
-  return %chained_fence : !hal.fence
+  // CHECK: util.return %[[ARG_FENCE]]
+  util.return %chained_fence : !hal.fence
 }
 
 // -----
@@ -42,109 +42,109 @@
 
 // CHECK-LABEL: @DontPassThroughChainExternal
 // CHECK-SAME: (%[[DEVICE:.+]]: !hal.device, %[[ARG_FENCE:.+]]: !hal.fence, %[[CHAINED_FENCE:.+]]: !hal.fence)
-func.func @DontPassThroughChainExternal(%device: !hal.device, %arg_fence: !hal.fence, %chained_fence: !hal.fence) -> !hal.fence {
+util.func private @DontPassThroughChainExternal(%device: !hal.device, %arg_fence: !hal.fence, %chained_fence: !hal.fence) -> !hal.fence {
   // CHECK: %[[TIMEPOINT:.+]] = stream.timepoint.import %[[ARG_FENCE]]
   %timepoint = stream.timepoint.import %arg_fence : (!hal.fence) => !stream.timepoint
   // CHECK: stream.timepoint.chain_external %[[TIMEPOINT]] => (%[[CHAINED_FENCE]]
   stream.timepoint.chain_external %timepoint => (%chained_fence : !hal.fence)
-  // CHECK: return %[[CHAINED_FENCE]]
-  return %chained_fence : !hal.fence
+  // CHECK: util.return %[[CHAINED_FENCE]]
+  util.return %chained_fence : !hal.fence
 }
 
 // -----
 
 // CHECK-LABEL: @FoldTimepointJoinOp
-func.func @FoldTimepointJoinOp(%arg0: !stream.timepoint) -> !stream.timepoint {
+util.func private @FoldTimepointJoinOp(%arg0: !stream.timepoint) -> !stream.timepoint {
   // CHECK-NOT: stream.timepoint.join
   %0 = stream.timepoint.join max(%arg0) => !stream.timepoint
-  // CHECK: return %arg0
-  return %0 : !stream.timepoint
+  // CHECK: util.return %arg0
+  util.return %0 : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @ElideImmediateTimepointJoinOperands
-func.func @ElideImmediateTimepointJoinOperands(%arg0: !stream.timepoint, %arg1: !stream.timepoint) -> !stream.timepoint {
+util.func private @ElideImmediateTimepointJoinOperands(%arg0: !stream.timepoint, %arg1: !stream.timepoint) -> !stream.timepoint {
   %0 = stream.timepoint.immediate => !stream.timepoint
   %1 = stream.timepoint.immediate => !stream.timepoint
   // CHECK: = stream.timepoint.join max(%arg0, %arg1)
   %2 = stream.timepoint.join max(%arg0, %0, %1, %arg1) => !stream.timepoint
-  return %2 : !stream.timepoint
+  util.return %2 : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @ElideImmediateTimepointJoinOperandsAll
-func.func @ElideImmediateTimepointJoinOperandsAll() -> !stream.timepoint {
+util.func private @ElideImmediateTimepointJoinOperandsAll() -> !stream.timepoint {
   %0 = stream.timepoint.immediate => !stream.timepoint
   %1 = stream.timepoint.immediate => !stream.timepoint
   // CHECK-NOT: stream.timepoint.join
   %2 = stream.timepoint.join max(%0, %1) => !stream.timepoint
   // CHECK: %[[IMM:.+]] = stream.timepoint.immediate
-  // CHECK: return %[[IMM]]
-  return %2 : !stream.timepoint
+  // CHECK: util.return %[[IMM]]
+  util.return %2 : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @FoldDuplicateTimepointJoinOperands
-func.func @FoldDuplicateTimepointJoinOperands(%arg0: !stream.timepoint, %arg1: !stream.timepoint) -> !stream.timepoint {
+util.func private @FoldDuplicateTimepointJoinOperands(%arg0: !stream.timepoint, %arg1: !stream.timepoint) -> !stream.timepoint {
   // CHECK: = stream.timepoint.join max(%arg0, %arg1)
   %0 = stream.timepoint.join max(%arg0, %arg1, %arg0, %arg1) => !stream.timepoint
-  return %0 : !stream.timepoint
+  util.return %0 : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @ExpandTimepointJoinOperands
-func.func @ExpandTimepointJoinOperands(%arg0: !stream.timepoint, %arg1: !stream.timepoint, %arg2: !stream.timepoint, %arg3: !stream.timepoint) -> !stream.timepoint {
+util.func private @ExpandTimepointJoinOperands(%arg0: !stream.timepoint, %arg1: !stream.timepoint, %arg2: !stream.timepoint, %arg3: !stream.timepoint) -> !stream.timepoint {
   %join0 = stream.timepoint.join max(%arg0, %arg1) => !stream.timepoint
   // CHECK: %[[JOIN:.+]] = stream.timepoint.join max(%arg2, %arg0, %arg1, %arg3)
   %join1 = stream.timepoint.join max(%arg2, %join0, %arg3) => !stream.timepoint
-  // CHECK: return %[[JOIN]]
-  return %join1 : !stream.timepoint
+  // CHECK: util.return %[[JOIN]]
+  util.return %join1 : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @ElideImmediateBarrier
 // CHECK-SAME: (%[[SIZE:.+]]: index)
-func.func @ElideImmediateBarrier(%size: index) -> (!stream.resource<external>, !stream.timepoint) {
+util.func private @ElideImmediateBarrier(%size: index) -> (!stream.resource<external>, !stream.timepoint) {
   // CHECK-DAG: %[[RESOURCE:.+]] = stream.resource.alloc
   %r0 = stream.resource.alloc uninitialized : !stream.resource<external>{%size}
   // CHECK-DAG: %[[FENCE:.+]] = stream.timepoint.immediate
   // CHECK-NOT: stream.timepoint.barrier
   %r1, %r1t = stream.timepoint.barrier %r0 : !stream.resource<external>{%size} => !stream.timepoint
-  // CHECK: return %[[RESOURCE]], %[[FENCE]]
-  return %r1, %r1t : !stream.resource<external>, !stream.timepoint
+  // CHECK: util.return %[[RESOURCE]], %[[FENCE]]
+  util.return %r1, %r1t : !stream.resource<external>, !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @ChainTimepoints
 // CHECK-SAME: (%[[FENCE:.+]]: !stream.timepoint, %[[SOURCE:.+]]: !stream.resource<external>)
-func.func @ChainTimepoints(%fence: !stream.timepoint, %source: !stream.resource<external>) -> (!stream.resource<external>, !stream.timepoint) {
+util.func private @ChainTimepoints(%fence: !stream.timepoint, %source: !stream.resource<external>) -> (!stream.resource<external>, !stream.timepoint) {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   // CHECK-NOT: stream.timepoint.await
   %r0 = stream.timepoint.await %fence => %source : !stream.resource<external>{%c128}
   // CHECK-NOT: stream.timepoint.barrier
   %r1, %r1t = stream.timepoint.barrier %r0 : !stream.resource<external>{%c128} => !stream.timepoint
-  // CHECK: return %[[SOURCE]], %[[FENCE]]
-  return %r1, %r1t : !stream.resource<external>, !stream.timepoint
+  // CHECK: util.return %[[SOURCE]], %[[FENCE]]
+  util.return %r1, %r1t : !stream.resource<external>, !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @ElideImmediateHostAwaits
-func.func @ElideImmediateHostAwaits(%arg0: !stream.resource<staging>) -> !stream.resource<staging> {
+util.func private @ElideImmediateHostAwaits(%arg0: !stream.resource<staging>) -> !stream.resource<staging> {
   %c100 = arith.constant 100 : index
   // CHECK-NOT: stream.timepoint.immediate
   %0 = stream.timepoint.immediate => !stream.timepoint
   // CHECK-NOT: stream.timepoint.await
   %1 = stream.timepoint.await %0 => %arg0 : !stream.resource<staging>{%c100}
-  // CHECK: return %arg0
-  return %1 : !stream.resource<staging>
+  // CHECK: util.return %arg0
+  util.return %1 : !stream.resource<staging>
 }
 
 // -----
@@ -153,7 +153,7 @@
 // use the awaited resources.
 
 // CHECK-LABEL: @SinkAwaitToFirstConsumer
-func.func @SinkAwaitToFirstConsumer(
+util.func private @SinkAwaitToFirstConsumer(
   %arg0: i1, %arg1: i1,
   %arg2: !stream.resource<constant>,
   %arg3: !stream.resource<staging>,
@@ -183,13 +183,13 @@
   cf.br ^bb4(%2 : !stream.resource<external>)
 // CHECK: ^bb4(
 ^bb4(%arg6: !stream.resource<external>):
-  return %arg6 : !stream.resource<external>
+  util.return %arg6 : !stream.resource<external>
 }
 
 // -----
 
 // CHECK-LABEL: @SinkSubviewsAcrossAwaits
-func.func @SinkSubviewsAcrossAwaits(
+util.func private @SinkSubviewsAcrossAwaits(
   %arg0: !stream.resource<*>, %arg1: index,
   %arg2: !stream.timepoint
 ) -> !stream.resource<*> {
@@ -199,14 +199,14 @@
   // CHECK: %[[RET:.+]] = stream.resource.subview %[[READY]][%c128] : !stream.resource<*>{%arg1} -> !stream.resource<*>{%c256}
   %0 = stream.resource.subview %arg0[%c128] : !stream.resource<*>{%arg1} -> !stream.resource<*>{%c256}
   %1 = stream.timepoint.await %arg2 => %0 : !stream.resource<*>{%c256}
-  // CHECK: return %[[RET]]
-  return %1 : !stream.resource<*>
+  // CHECK: util.return %[[RET]]
+  util.return %1 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @GroupAwaitsByTimepoint
-func.func @GroupAwaitsByTimepoint(
+util.func private @GroupAwaitsByTimepoint(
   %arg0: !stream.timepoint,
   %arg1: !stream.resource<*>,
   %arg2: !stream.resource<*>,
@@ -222,8 +222,8 @@
   %0 = stream.timepoint.await %arg0 => %arg1 : !stream.resource<*>{%c100}
   %1 = stream.timepoint.await %arg0 => %arg2 : !stream.resource<*>{%c101}
   %2:2 = stream.timepoint.await %arg0 => %arg3, %arg4 : !stream.resource<*>{%c102}, !stream.resource<*>{%c103}
-  // CHECK-NEXT: return %[[RET]]#0, %[[RET]]#1, %[[RET]]#2, %[[RET]]#3
-  return %0, %1, %2#0, %2#1 : !stream.resource<*>, !stream.resource<*>, !stream.resource<*>, !stream.resource<*>
+  // CHECK-NEXT: util.return %[[RET]]#0, %[[RET]]#1, %[[RET]]#2, %[[RET]]#3
+  util.return %0, %1, %2#0, %2#1 : !stream.resource<*>, !stream.resource<*>, !stream.resource<*>, !stream.resource<*>
 }
 
 // -----
@@ -231,25 +231,25 @@
 // Tests that the pattern doesn't kick in when it would be unsafe to group the
 // awaits due to operand dependencies.
 
-func.func private @materializeResource0() -> !stream.resource<*>
-func.func private @materializeResource1(!stream.resource<*>) -> !stream.resource<*>
+util.func private @materializeResource0() -> !stream.resource<*>
+util.func private @materializeResource1(!stream.resource<*>) -> !stream.resource<*>
 
 // CHECK-LABEL: @GroupAwaitsByTimepointUnsafe
-func.func @GroupAwaitsByTimepointUnsafe(
+util.func private @GroupAwaitsByTimepointUnsafe(
   %arg0: !stream.timepoint
 ) -> (!stream.resource<*>, !stream.resource<*>) {
   %c100 = arith.constant 100 : index
   %c101 = arith.constant 101 : index
-  // CHECK: call @materializeResource0
-  %r0a = call @materializeResource0() : () -> !stream.resource<*>
+  // CHECK: util.call @materializeResource0
+  %r0a = util.call @materializeResource0() : () -> !stream.resource<*>
   // CHECK-NEXT: stream.timepoint.await
   %r0b = stream.timepoint.await %arg0 => %r0a : !stream.resource<*>{%c100}
-  // CHECK-NEXT: call @materializeResource1
-  %r1a = call @materializeResource1(%r0b) : (!stream.resource<*>) -> !stream.resource<*>
+  // CHECK-NEXT: util.call @materializeResource1
+  %r1a = util.call @materializeResource1(%r0b) : (!stream.resource<*>) -> !stream.resource<*>
   // CHECK-NEXT: stream.timepoint.await
   %r1b = stream.timepoint.await %arg0 => %r1a : !stream.resource<*>{%c101}
-  // CHECK-NEXT: return
-  return %r0b, %r1b : !stream.resource<*>, !stream.resource<*>
+  // CHECK-NEXT: util.return
+  util.return %r0b, %r1b : !stream.resource<*>, !stream.resource<*>
 }
 
 // -----
@@ -257,10 +257,10 @@
 // Tests that the pattern doesn't kick in when the same timepoint are waited in
 // different blocks.
 
-func.func private @materializeResource() -> !stream.resource<*>
+util.func private @materializeResource() -> !stream.resource<*>
 
 // CHECK-LABEL: @DontGroupAwaitsByTimepointAcrossBlocks
-func.func @DontGroupAwaitsByTimepointAcrossBlocks(
+util.func private @DontGroupAwaitsByTimepointAcrossBlocks(
   %arg0: !stream.timepoint,
   %arg1: !stream.resource<*>,
   %arg2: i1
@@ -273,20 +273,20 @@
 // CHECK: ^bb
 ^bb0:
   // CHECK: stream.timepoint.await %arg0 => %arg1
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 // CHECK: ^bb
 ^bb1:
-  // CHECK: %[[R:.+]] = call @materializeResource
-  %r = call @materializeResource() : () -> !stream.resource<*>
+  // CHECK: %[[R:.+]] = util.call @materializeResource
+  %r = util.call @materializeResource() : () -> !stream.resource<*>
   // CHECK: stream.timepoint.await %arg0 => %[[R]]
   %1 = stream.timepoint.await %arg0 => %r : !stream.resource<*>{%c101}
-  return %1 : !stream.resource<*>
+  util.return %1 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @FoldDuplicateAwaitResources
-func.func @FoldDuplicateAwaitResources(
+util.func private @FoldDuplicateAwaitResources(
   %arg0: !stream.timepoint,
   %arg1: !stream.resource<staging>, %arg2: !stream.resource<*>
 ) -> (!stream.resource<staging>, !stream.resource<*>, !stream.resource<staging>, !stream.resource<staging>) {
@@ -294,14 +294,14 @@
   %c200 = arith.constant 200 : index
   // CHECK: %[[RET:.+]]:2 = stream.timepoint.await %arg0 => %arg1, %arg2 : !stream.resource<staging>{%c100}, !stream.resource<*>{%c200}
   %0:4 = stream.timepoint.await %arg0 => %arg1, %arg2, %arg1, %arg1 : !stream.resource<staging>{%c100}, !stream.resource<*>{%c200}, !stream.resource<staging>{%c100}, !stream.resource<staging>{%c100}
-  // CHECK: return %[[RET]]#0, %[[RET]]#1, %[[RET]]#0, %[[RET]]#0
-  return %0#0, %0#1, %0#2, %0#3 : !stream.resource<staging>, !stream.resource<*>, !stream.resource<staging>, !stream.resource<staging>
+  // CHECK: util.return %[[RET]]#0, %[[RET]]#1, %[[RET]]#0, %[[RET]]#0
+  util.return %0#0, %0#1, %0#2, %0#3 : !stream.resource<staging>, !stream.resource<*>, !stream.resource<staging>, !stream.resource<staging>
 }
 
 // -----
 
 // CHECK-LABEL: @ElideUnusedTimepointAwaitOp
-func.func @ElideUnusedTimepointAwaitOp(
+util.func private @ElideUnusedTimepointAwaitOp(
   %arg0: !stream.timepoint,
   %arg1: !stream.resource<staging>, %arg2: !stream.resource<*>
 ) {
@@ -309,5 +309,5 @@
   %c200 = arith.constant 200 : index
   // CHECK-NOT: stream.timepoint.await
   %0:2 = stream.timepoint.await %arg0 => %arg1, %arg2 : !stream.resource<staging>{%c100}, !stream.resource<*>{%c200}
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/test/timepoint_ops.mlir b/compiler/src/iree/compiler/Dialect/Stream/IR/test/timepoint_ops.mlir
index 15c6d6b..ceb0994 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/test/timepoint_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/test/timepoint_ops.mlir

@@ -1,55 +1,55 @@
 // RUN: iree-opt --split-input-file %s | iree-opt --split-input-file | FileCheck %s
 
 // CHECK-LABEL: @timepointImmediate
-func.func @timepointImmediate() -> !stream.timepoint {
+util.func private @timepointImmediate() -> !stream.timepoint {
   // CHECK: = stream.timepoint.immediate => !stream.timepoint
   %0 = stream.timepoint.immediate => !stream.timepoint
-  return %0 : !stream.timepoint
+  util.return %0 : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @timepointImport
-func.func @timepointImport(%arg0: !hal.semaphore, %arg1: index) -> !stream.timepoint {
+util.func private @timepointImport(%arg0: !hal.semaphore, %arg1: index) -> !stream.timepoint {
   // CHECK: = stream.timepoint.import %arg0, %arg1 : (!hal.semaphore, index) => !stream.timepoint
   %0 = stream.timepoint.import %arg0, %arg1 : (!hal.semaphore, index) => !stream.timepoint
-  return %0 : !stream.timepoint
+  util.return %0 : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @timepointExport
-func.func @timepointExport(%arg0: !stream.timepoint) -> (!hal.semaphore, index) {
+util.func private @timepointExport(%arg0: !stream.timepoint) -> (!hal.semaphore, index) {
   // CHECK: = stream.timepoint.export %arg0 => (!hal.semaphore, index)
   %0:2 = stream.timepoint.export %arg0 => (!hal.semaphore, index)
-  return %0#0, %0#1 : !hal.semaphore, index
+  util.return %0#0, %0#1 : !hal.semaphore, index
 }
 
 // -----
 
 // CHECK-LABEL: @timepointChainExternal
-func.func @timepointChainExternal(%arg0: !stream.timepoint, %arg1: !hal.fence) {
+util.func private @timepointChainExternal(%arg0: !stream.timepoint, %arg1: !hal.fence) {
   // CHECK: stream.timepoint.chain_external %arg0 => (%arg1 : !hal.fence)
   stream.timepoint.chain_external %arg0 => (%arg1 : !hal.fence)
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @timepointJoin
-func.func @timepointJoin(%arg0: !stream.timepoint, %arg1: !stream.timepoint) -> !stream.timepoint {
+util.func private @timepointJoin(%arg0: !stream.timepoint, %arg1: !stream.timepoint) -> !stream.timepoint {
   // CHECK: = stream.timepoint.join max(%arg0, %arg1) => !stream.timepoint
   %0 = stream.timepoint.join max(%arg0, %arg1) => !stream.timepoint
-  return %0 : !stream.timepoint
+  util.return %0 : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @timepointAwait
-func.func @timepointAwait(%arg0: !stream.timepoint, %arg1: !stream.resource<staging>, %arg2: !stream.resource<*>) -> (!stream.resource<staging>, !stream.resource<*>) {
+util.func private @timepointAwait(%arg0: !stream.timepoint, %arg1: !stream.resource<staging>, %arg2: !stream.resource<*>) -> (!stream.resource<staging>, !stream.resource<*>) {
   %c100 = arith.constant 100 : index
   %c200 = arith.constant 200 : index
   // CHECK: = stream.timepoint.await %arg0 => %arg1, %arg2 : !stream.resource<staging>{%c100}, !stream.resource<*>{%c200}
   %0:2 = stream.timepoint.await %arg0 => %arg1, %arg2 : !stream.resource<staging>{%c100}, !stream.resource<*>{%c200}
-  return %0#0, %0#1 : !stream.resource<staging>, !stream.resource<*>
+  util.return %0#0, %0#1 : !stream.resource<staging>, !stream.resource<*>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/ElideTimepoints.cpp b/compiler/src/iree/compiler/Dialect/Stream/Transforms/ElideTimepoints.cpp
index 12c9f6d..7138212 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/ElideTimepoints.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/ElideTimepoints.cpp

@@ -21,7 +21,6 @@
 #include "mlir/Analysis/Liveness.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/AsmState.h"
 #include "mlir/IR/Attributes.h"
@@ -870,7 +869,7 @@
         })
         .Case<cf::BranchOp, cf::CondBranchOp>(
             [&](Operation *op) { elideTimepointOperands(op); })
-        .Case<func::ReturnOp, scf::YieldOp>(
+        .Case<IREE::Util::ReturnOp, scf::YieldOp>(
             [&](Operation *op) { elideTimepointOperands(op); });
   });
 

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/Passes.td b/compiler/src/iree/compiler/Dialect/Stream/Transforms/Passes.td
index fabe47f..83f1d0f 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/Passes.td
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/Passes.td

@@ -262,7 +262,6 @@
   }];
   let dependentDialects = [
     "mlir::cf::ControlFlowDialect",
-    "mlir::func::FuncDialect",
     "IREE::Stream::StreamDialect",
     "IREE::Util::UtilDialect",
   ];

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/PropagateTimepoints.cpp b/compiler/src/iree/compiler/Dialect/Stream/Transforms/PropagateTimepoints.cpp
index 54b949d..1e0a667 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/PropagateTimepoints.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/PropagateTimepoints.cpp

@@ -18,7 +18,6 @@
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Debug.h"
 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinTypes.h"
@@ -57,7 +56,8 @@
 // as duplicates will get added and we'll need to rely on global fusion to
 // get rid of them. Note that this only expands globals and does not yet update
 // use sites - we just need the ops to reference while doing so.
-static ExpandedGlobalMap expandResourceGlobals(Operation *rootOp) {
+static ExpandedGlobalMap expandResourceGlobals(Operation *rootOp,
+                                               SymbolTable &symbolTable) {
   ExpandedGlobalMap expandedGlobals;
 
   // Gather all of the resource globals in the root.
@@ -70,7 +70,6 @@
   }
 
   // Expand each global by adding the timepoint right next to it.
-  SymbolTable symbolTable(rootOp);
   auto timepointType = IREE::Stream::TimepointType::get(rootOp->getContext());
   auto immediateAttr =
       IREE::Stream::TimepointAttr::get(rootOp->getContext(), timepointType);
@@ -103,20 +102,22 @@
          llvm::any_of(op->getResultTypes(), isResourceType);
 }
 
+static void expandType(Type type, SmallVectorImpl<Type> &newTypes) {
+  newTypes.push_back(type);
+  if (isResourceType(type)) {
+    newTypes.push_back(IREE::Stream::TimepointType::get(type.getContext()));
+  }
+}
+
 // Expands resources in the given |types| list to (timepoint, resource).
 // This could be changed to some iterator magic to avoid the alloc.
 static SmallVector<Type> expandTypes(TypeRange types) {
   if (types.empty())
     return {};
-  auto timepointType =
-      IREE::Stream::TimepointType::get(types.front().getContext());
   SmallVector<Type> newTypes;
   newTypes.reserve(types.size() * 2);
   for (auto type : types) {
-    if (isResourceType(type)) {
-      newTypes.push_back(timepointType);
-    }
-    newTypes.push_back(type);
+    expandType(type, newTypes);
   }
   return newTypes;
 }
@@ -150,6 +151,19 @@
   }
 }
 
+static void expandOperand(Location loc, Value operand,
+                          SmallVectorImpl<Value> &newOperands,
+                          IRMapping &resourceTimepointMap, OpBuilder &builder) {
+  if (isResourceType(operand.getType())) {
+    auto [timepoint, resource] =
+        consumeTimepoint(loc, operand, resourceTimepointMap, builder);
+    newOperands.push_back(resource);
+    newOperands.push_back(timepoint);
+  } else {
+    newOperands.push_back(operand);
+  }
+}
+
 // Expands resources in |operands| into (timepoint, resource) pairs.
 static SmallVector<Value> expandOperands(Location loc, ValueRange operands,
                                          IRMapping &resourceTimepointMap,
@@ -157,19 +171,13 @@
   SmallVector<Value> result;
   result.reserve(operands.size() * 2);
   for (auto operand : operands) {
-    if (isResourceType(operand.getType())) {
-      auto timepointOperand =
-          consumeTimepoint(loc, operand, resourceTimepointMap, builder);
-      result.push_back(timepointOperand.first);
-      result.push_back(timepointOperand.second);
-    } else {
-      result.push_back(operand);
-    }
+    expandOperand(loc, operand, result, resourceTimepointMap, builder);
   }
   return result;
 }
 
-static void expandTimepoints(Operation *op, ExpandedGlobalMap &globalMap,
+static void expandTimepoints(Operation *op, SymbolTable &symbolTable,
+                             ExpandedGlobalMap &globalMap,
                              IRMapping &resourceTimepointMap);
 
 // Finds the size of a block argument resource or materializes a size if needed.
@@ -222,7 +230,7 @@
 // Recursively expands resources into (timepoint, resource) pairs within the
 // given |region|. All branches, ops, and nested regions will be processed.
 static void expandRegion(Region &region, bool canModifyEntryBlock,
-                         ExpandedGlobalMap &globalMap,
+                         SymbolTable &symbolTable, ExpandedGlobalMap &globalMap,
                          IRMapping resourceTimepointMap) {
   if (region.empty())
     return;
@@ -242,20 +250,18 @@
       if (!isResourceType(resourceArg.getType()))
         continue;
       auto timepointArg =
-          block.insertArgument(i, timepointType, resourceArg.getLoc());
+          block.insertArgument(i + 1, timepointType, resourceArg.getLoc());
       expansions.push_back(std::make_pair(timepointArg, resourceArg));
       resourceTimepointMap.map(resourceArg, timepointArg);
     }
 
     // Insert awaits that we've sunk from callers.
     auto builder = OpBuilder::atBlockBegin(&block);
-    for (auto expansion : llvm::reverse(expansions)) {
+    for (auto [timepoint, resource] : llvm::reverse(expansions)) {
       // If we can look down the chain and see the size then we can use that.
       // If it's a constant we can't use it as it may be defined anywhere in the
       // region. Dynamic dimensions usually come from outside or entry arguments
       // though and those are available.
-      auto timepoint = expansion.first;
-      auto resource = expansion.second;
       auto resourceSize =
           makeBlockArgResourceSize(region.getLoc(), resource, builder);
       auto awaitOp = builder.create<IREE::Stream::TimepointAwaitOp>(
@@ -276,14 +282,14 @@
   if (region.hasOneBlock()) {
     for (auto &op :
          llvm::make_early_inc_range(region.front().getOperations())) {
-      expandTimepoints(&op, globalMap, resourceTimepointMap);
+      expandTimepoints(&op, symbolTable, globalMap, resourceTimepointMap);
     }
   } else {
     DominanceInfo domInfo(region.getParentOp());
     for (auto *blockInfo : llvm::breadth_first(domInfo.getRootNode(&region))) {
       auto *block = blockInfo->getBlock();
       for (auto &op : llvm::make_early_inc_range(block->getOperations())) {
-        expandTimepoints(&op, globalMap, resourceTimepointMap);
+        expandTimepoints(&op, symbolTable, globalMap, resourceTimepointMap);
       }
     }
   }
@@ -357,23 +363,11 @@
 }
 
 static void expandInitializerOp(IREE::Util::InitializerOp op,
+                                SymbolTable &symbolTable,
                                 ExpandedGlobalMap &globalMap,
                                 IRMapping &resourceTimepointMap) {
-  expandRegion(op.getRegion(), /*canModifyEntryBlock=*/false, globalMap,
-               resourceTimepointMap);
-}
-
-// Returns true if |op| is either public and visible to external modules or
-// external and resolved later on. We can't modify their signatures.
-static bool isPublicOrExternal(CallableOpInterface callableOp) {
-  if (auto symbolOp = dyn_cast<SymbolOpInterface>(callableOp.getOperation())) {
-    if (symbolOp.isPublic())
-      return true;
-  }
-  auto *region = callableOp.getCallableRegion();
-  if (!region || region->empty())
-    return true;
-  return false;
+  expandRegion(op.getRegion(), /*canModifyEntryBlock=*/false, symbolTable,
+               globalMap, resourceTimepointMap);
 }
 
 // Inserts awaits on resource arguments.
@@ -384,24 +378,25 @@
 // don't need a wait.
 //
 // Example:
-//  func.func @foo(%0: !stream.resource)
+//  util.func @foo(%0: !stream.resource)
 //  ->
-//  func.func @foo(%t: !stream.timepoint, %0: !stream.resource) {
+//  util.func @foo(%t: !stream.timepoint, %0: !stream.resource) {
 //    %1 = stream.timepoint.await %t, %0
-static void expandFuncOp(mlir::func::FuncOp op, ExpandedGlobalMap &globalMap,
+static void expandFuncOp(IREE::Util::FuncOp op, SymbolTable &symbolTable,
+                         ExpandedGlobalMap &globalMap,
                          IRMapping &resourceTimepointMap) {
   // Ignore public/external function signatures but still convert regions.
-  bool canModifyEntryBlock = !isPublicOrExternal(op);
+  bool canModifyEntryBlock = !IREE::Util::isPublicOrExternal(op);
   if (canModifyEntryBlock) {
-    auto oldType = op.getFunctionType();
-    auto inputTypes = expandTypes(oldType.getInputs());
-    auto resultTypes = expandTypes(oldType.getResults());
-    auto newType = FunctionType::get(op.getContext(), inputTypes, resultTypes);
-    if (newType != oldType) {
-      op.setType(newType);
-    }
+    op.expandSignature(
+        [&](unsigned i, Type type, SmallVectorImpl<Type> &newTypes) {
+          expandType(type, newTypes);
+        },
+        [&](unsigned i, Type type, SmallVectorImpl<Type> &newTypes) {
+          expandType(type, newTypes);
+        });
   }
-  expandRegion(op.getRegion(), canModifyEntryBlock, globalMap,
+  expandRegion(op.getRegion(), canModifyEntryBlock, symbolTable, globalMap,
                resourceTimepointMap);
 }
 
@@ -414,28 +409,31 @@
 //
 // Example:
 //  %1 = stream.timepoint.await %t, %0
-//  %r = call @foo(%1)
+//  %r = util.call @foo(%1)
 //  ->
-//  %rt, %r = call @foo(%t, %0)
+//  %rt, %r = util.call @foo(%t, %0)
 //  stream.timepoint.await %rt, %t
-static void expandCallOp(mlir::func::CallOp op,
+static void expandCallOp(IREE::Util::CallOp op, SymbolTable &symbolTable,
                          IRMapping &resourceTimepointMap) {
   if (!usesResources(op))
     return;
 
   // Ignore calls to public/external functions.
-  auto calleeOp = SymbolTable::lookupNearestSymbolFrom<CallableOpInterface>(
-      op, op.getCalleeAttr());
-  if (isPublicOrExternal(calleeOp))
+  auto calleeOp = symbolTable.lookup<CallableOpInterface>(op.getCallee());
+  if (IREE::Util::isPublicOrExternal(calleeOp))
     return;
 
   // Build the new call op with expanded operands and results.
   OpBuilder builder(op);
-  auto operands = expandOperands(op.getLoc(), op.getOperands(),
-                                 resourceTimepointMap, builder);
-  auto resultTypes = expandTypes(op.getResultTypes());
-  auto newOp = builder.create<mlir::func::CallOp>(op.getLoc(), op.getCallee(),
-                                                  resultTypes, operands);
+  auto newOp = op.cloneAndExpand(
+      [&](unsigned i, Value operand, SmallVectorImpl<Value> &newOperands) {
+        expandOperand(op.getLoc(), operand, newOperands, resourceTimepointMap,
+                      builder);
+      },
+      [&](unsigned i, Type type, SmallVectorImpl<Type> &newTypes) {
+        expandType(type, newTypes);
+      },
+      builder);
 
   // Insert awaits on results that we are sinking across the call edge.
   // The hope is that by moving the awaits here we can fold with uses inside
@@ -449,8 +447,8 @@
       oldResult.replaceAllUsesWith(newResult);
       continue;
     }
-    auto newTimepoint = newOp.getResult(newIdx++);
     auto newResult = newOp.getResult(newIdx++);
+    auto newTimepoint = newOp.getResult(newIdx++);
     resourceTimepointMap.map(newResult, newTimepoint);
     auto newResultSize =
         builder.create<IREE::Stream::ResourceSizeOp>(op.getLoc(), newResult)
@@ -469,19 +467,19 @@
 //
 // Example:
 //  %1 = stream.timepoint.await %t, %0
-//  return %1
+//  util.return %1
 //  ->
-//  return %t, %0
-static void expandReturnOp(mlir::func::ReturnOp op,
+//  util.return %t, %0
+static void expandReturnOp(IREE::Util::ReturnOp op,
                            IRMapping &resourceTimepointMap) {
   if (!usesResources(op))
     return;
-  if (isPublicOrExternal(op->getParentOfType<mlir::func::FuncOp>()))
+  if (IREE::Util::isPublicOrExternal(op->getParentOfType<IREE::Util::FuncOp>()))
     return;
   OpBuilder builder(op);
   auto operands = expandOperands(op.getLoc(), op.getOperands(),
                                  resourceTimepointMap, builder);
-  builder.create<mlir::func::ReturnOp>(op.getLoc(), operands);
+  builder.create<IREE::Util::ReturnOp>(op.getLoc(), operands);
   op.erase();
 }
 
@@ -491,11 +489,11 @@
 // Example:
 //    %1 = stream.timepoint.await %t, %0
 //    br ^bb1(%1)
-//  ^bb1(%b):
+//  ^bb1(%bb_1):
 //  ->
-//    br ^bb1(%t, %0)
-//  ^bb1(%a, %b):
-//    %1 = stream.timepoint.await %a, %b
+//    br ^bb1(%0, %t)
+//  ^bb1(%bb_0, %bb_t):
+//    %1 = stream.timepoint.await %bb_t, %bb_0
 static void expandBranchOp(mlir::cf::BranchOp op,
                            IRMapping &resourceTimepointMap) {
   if (!usesResources(op))
@@ -598,19 +596,21 @@
 // Recursively expands resources into (timepoint, resource) in |op|.
 // Resource timepoint chains are established when possible by looking through
 // awaits.
-static void expandTimepoints(Operation *op, ExpandedGlobalMap &globalMap,
+static void expandTimepoints(Operation *op, SymbolTable &symbolTable,
+                             ExpandedGlobalMap &globalMap,
                              IRMapping &resourceTimepointMap) {
   if (auto loadOp = dyn_cast<IREE::Util::GlobalLoadOpInterface>(op)) {
     expandGlobalLoadOp(loadOp, globalMap, resourceTimepointMap);
   } else if (auto storeOp = dyn_cast<IREE::Util::GlobalStoreOpInterface>(op)) {
     expandGlobalStoreOp(storeOp, globalMap, resourceTimepointMap);
   } else if (auto initializerOp = dyn_cast<IREE::Util::InitializerOp>(op)) {
-    expandInitializerOp(initializerOp, globalMap, resourceTimepointMap);
-  } else if (auto funcOp = dyn_cast<mlir::func::FuncOp>(op)) {
-    expandFuncOp(funcOp, globalMap, resourceTimepointMap);
-  } else if (auto callOp = dyn_cast<mlir::func::CallOp>(op)) {
-    expandCallOp(callOp, resourceTimepointMap);
-  } else if (auto returnOp = dyn_cast<mlir::func::ReturnOp>(op)) {
+    expandInitializerOp(initializerOp, symbolTable, globalMap,
+                        resourceTimepointMap);
+  } else if (auto funcOp = dyn_cast<IREE::Util::FuncOp>(op)) {
+    expandFuncOp(funcOp, symbolTable, globalMap, resourceTimepointMap);
+  } else if (auto callOp = dyn_cast<IREE::Util::CallOp>(op)) {
+    expandCallOp(callOp, symbolTable, resourceTimepointMap);
+  } else if (auto returnOp = dyn_cast<IREE::Util::ReturnOp>(op)) {
     expandReturnOp(returnOp, resourceTimepointMap);
   } else if (auto branchOp = dyn_cast<mlir::cf::BranchOp>(op)) {
     expandBranchOp(branchOp, resourceTimepointMap);
@@ -644,9 +644,10 @@
           PropagateTimepointsPass> {
   void runOnOperation() override {
     auto rootOp = getOperation();
+    SymbolTable symbolTable(rootOp);
 
     // Expand all util.global ops holding resources into (timepoint, resource).
-    auto globalMap = expandResourceGlobals(rootOp);
+    auto globalMap = expandResourceGlobals(rootOp, symbolTable);
 
     // Walk the entire IR tree and expand the globals.
     // We could do this via pattern application but that gets much trickier to
@@ -654,7 +655,8 @@
     // expanding multiple times.
     for (auto callableOp : rootOp.getOps<mlir::CallableOpInterface>()) {
       IRMapping resourceTimepointMap;
-      expandTimepoints(callableOp, globalMap, resourceTimepointMap);
+      expandTimepoints(callableOp, symbolTable, globalMap,
+                       resourceTimepointMap);
     }
   }
 };

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/RefineUsage.cpp b/compiler/src/iree/compiler/Dialect/Stream/Transforms/RefineUsage.cpp
index abb4cce..bc73616 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/RefineUsage.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/RefineUsage.cpp

@@ -15,7 +15,6 @@
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Debug.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
@@ -224,9 +223,9 @@
 // Applies usage analysis results to an MLIR function.
 // All resource arguments and results, block arguments, and nested operations
 // will have their lifetime specified.
-struct ApplyFuncOp : public UsageRefinementPattern<mlir::func::FuncOp> {
-  using UsageRefinementPattern<mlir::func::FuncOp>::UsageRefinementPattern;
-  LogicalResult matchAndRewrite(mlir::func::FuncOp op,
+struct ApplyFuncOp : public UsageRefinementPattern<IREE::Util::FuncOp> {
+  using UsageRefinementPattern<IREE::Util::FuncOp>::UsageRefinementPattern;
+  LogicalResult matchAndRewrite(IREE::Util::FuncOp op,
                                 PatternRewriter &rewriter) const override {
     if (op.isExternal()) {
       return rewriter.notifyMatchFailure(op, "external funcs not supported");
@@ -255,7 +254,7 @@
 
     // Results:
     SmallVector<Type> newOutputs;
-    auto anyReturnOp = *op.getOps<mlir::func::ReturnOp>().begin();
+    auto anyReturnOp = *op.getOps<IREE::Util::ReturnOp>().begin();
     for (auto outputType : llvm::enumerate(op.getFunctionType().getResults())) {
       auto oldType =
           llvm::dyn_cast<IREE::Stream::ResourceType>(outputType.value());
@@ -410,7 +409,7 @@
                   ApplyScfWhileOp>(context, analysis);
   patterns.insert<ApplyGenericOp<IREE::Util::OptimizationBarrierOp>,
                   ApplyGenericOp<mlir::arith::SelectOp>,
-                  ApplyGenericOp<mlir::func::CallOp>,
+                  ApplyGenericOp<IREE::Util::CallOp>,
                   ApplyGenericOp<mlir::scf::ConditionOp>,
                   ApplyGenericOp<mlir::scf::YieldOp>,
                   ApplyGenericOp<IREE::Stream::TimepointBarrierOp>>(context,

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/annotate_dispatch_arguments.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/annotate_dispatch_arguments.mlir
index feb6913..69a16bc 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/annotate_dispatch_arguments.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/annotate_dispatch_arguments.mlir

@@ -7,7 +7,7 @@
   // CHECK: stream.executable.export public @dispatch
   stream.executable.export public @dispatch
 }
-func.func @skipExternExecutables(%arg0: i32) {
+util.func public @skipExternExecutables(%arg0: i32) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c0_i32 = arith.constant 0 : i32
@@ -17,7 +17,7 @@
       rw %capture[%c0 for %c1] : !stream.resource<transient>{%c1}
     }
   } => !stream.timepoint
-  return
+  util.return
 }
 
 // -----
@@ -32,17 +32,17 @@
 stream.executable private @annotatePotentialValuesEx {
   stream.executable.export public @dispatch
   builtin.module  {
-    // CHECK: func.func @dispatch(
+    // CHECK:  util.func public @dispatch(
     // CHECK-SAME: %arg0: i32,
     // CHECK-SAME: %arg1: index {stream.alignment = 4 : index, stream.values = [20 : index, 40 : index]},
     // CHECK-SAME: %arg2: i1 {stream.values = [false, true]},
     // CHECK-SAME: %arg3: f32
-    func.func @dispatch(%arg0: i32, %arg1: index, %arg2: i1, %arg3: f32, %binding: !stream.binding) {
-      return
+     util.func public @dispatch(%arg0: i32, %arg1: index, %arg2: i1, %arg3: f32, %binding: !stream.binding) {
+      util.return
     }
   }
 }
-func.func @annotatePotentialValues(%arg0: i32) {
+util.func public @annotatePotentialValues(%arg0: i32) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c0_i32 = arith.constant 0 : i32
@@ -61,7 +61,7 @@
       rw %capture[%c0 for %c1] : !stream.resource<transient>{%c1}
     }
   } => !stream.timepoint
-  return
+  util.return
 }
 
 // -----
@@ -77,24 +77,24 @@
 stream.executable private @annotateOperandAlignmentEx {
   stream.executable.export public @dispatch
   builtin.module  {
-    // CHECK: func.func @dispatch(
+    // CHECK:  util.func public @dispatch(
     // CHECK-SAME: %arg0: index,
     // CHECK-SAME: %arg1: index {stream.alignment = 16 : index},
     // CHECK-SAME: %arg2: index {stream.values = [4096 : index, 4097 : index]},
     // CHECK-SAME: %arg3: index {stream.alignment = 16 : index, stream.values = [1200 : index, 5232 : index]}
     // CHECK-SAME: %arg4: index {stream.alignment = 1024 : index, stream.values = [1024 : index, 2048 : index]}
-    func.func @dispatch(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %binding: !stream.binding) {
-      return
+     util.func public @dispatch(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %binding: !stream.binding) {
+      util.return
     }
   }
 }
 util.global private mutable @global_var = 1024 : index
-func.func @otherFunc() {
+util.func public @otherFunc() {
   %c2048 = arith.constant 2048 : index
   util.global.store %c2048, @global_var : index
-  return
+  util.return
 }
-func.func @annotateOperandAlignment(%arg0: index, %arg1: index) {
+util.func public @annotateOperandAlignment(%arg0: index, %arg1: index) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c16 = arith.constant 16 : index
@@ -114,7 +114,7 @@
       rw %capture[%c0 for %c1] : !stream.resource<transient>{%c1}
     }
   } => !stream.timepoint
-  return
+  util.return
 }
 
 // -----
@@ -129,17 +129,17 @@
 stream.executable private @annotateBindingAlignmentEx {
   stream.executable.export public @dispatch
   builtin.module  {
-    // CHECK: func.func @dispatch(
+    // CHECK:  util.func public @dispatch(
     // CHECK-SAME: %arg0: !stream.binding {stream.alignment = 64 : index},
     // CHECK-SAME: %arg1: !stream.binding,
     // CHECK-SAME: %arg2: !stream.binding {stream.alignment = 8 : index},
     // CHECK-SAME: %arg3: !stream.binding {stream.alignment = 16 : index})
-    func.func @dispatch(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
-      return
+     util.func public @dispatch(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: !stream.binding) {
+      util.return
     }
   }
 }
-func.func @annotateBindingAlignment(%arg0: index, %arg1: index) {
+util.func public @annotateBindingAlignment(%arg0: index, %arg1: index) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c8 = arith.constant 8 : index
@@ -163,5 +163,5 @@
       rw %capture[%aligned1 for %c8] : !stream.resource<transient>{%c64}
     }
   } => !stream.timepoint
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/convert_to_stream.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/convert_to_stream.mlir
index a353ac6..fd68d30 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/convert_to_stream.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/convert_to_stream.mlir

@@ -5,8 +5,8 @@
   // CHECK: stream.executable.export public @dispatch
   flow.executable.export public @dispatch
   builtin.module {
-    // CHECK: func.func @dispatch(%arg0: !stream.binding, %arg1: !stream.binding, %[[ARG0_DIM0:.+]]: index, %[[ARG1_DIM1:.+]]: index)
-    func.func @dispatch(%arg0: !flow.dispatch.tensor<readonly:tensor<?x4xf32>>, %arg1: !flow.dispatch.tensor<writeonly:tensor<4x?xf32>>,
+    // CHECK:  util.func public @dispatch(%arg0: !stream.binding, %arg1: !stream.binding, %[[ARG0_DIM0:.+]]: index, %[[ARG1_DIM1:.+]]: index)
+     util.func public @dispatch(%arg0: !flow.dispatch.tensor<readonly:tensor<?x4xf32>>, %arg1: !flow.dispatch.tensor<writeonly:tensor<4x?xf32>>,
                    %arg0_dim0: index, %arg1_dim1: index) {
       // CHECK: %[[ARG0_TENSOR:.+]] = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x4xf32>>{%[[ARG0_DIM0]]}
       %arg0_tied = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor<readonly:tensor<?x4xf32>>{%arg0_dim0}
@@ -18,13 +18,13 @@
       // CHECK: flow.dispatch.tensor.store %[[TILE]], %[[ARG1_TENSOR]], offsets = [0, 0], sizes = [%[[ARG0_DIM0]], 4], strides = [1, 1] : tensor<?x4xf32> -> !flow.dispatch.tensor<writeonly:tensor<4x?xf32>>{%[[ARG1_DIM1]]}
       flow.dispatch.tensor.store %0, %arg1_tied, offsets = [0, 0], sizes = [%arg0_dim0, 4], strides = [1, 1] : tensor<?x4xf32> -> !flow.dispatch.tensor<writeonly:tensor<4x?xf32>>{%arg1_dim1}
 
-      return
+      util.return
     }
   }
 }
 
 // CHECK-LABEL: @simple_mul
-func.func @simple_mul(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
+util.func public @simple_mul(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
   // CHECK-DAG: %[[DIM0:.+]] = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
   %dim0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
   // CHECK-DAG: %[[ELEMENT_TYPE:.+]] = hal.element_type<f32>
@@ -44,8 +44,8 @@
   // CHECK: %[[RET0_T:.+]] = stream.async.transfer %[[RET0]] : !stream.resource<*>{%[[RET0_SIZE]]} -> !stream.resource<external>{%[[RET0_SIZE]]}
   // CHECK: %[[RET0_EXPORT:.+]] = stream.tensor.export %[[RET0_T]] : tensor<?xf32>{%[[DIM0]]} in !stream.resource<external>{%[[RET0_SIZE]]} -> !hal.buffer_view
   %2 = hal.tensor.export %1 : tensor<?xf32>{%dim0} -> !hal.buffer_view
-  // CHECK: return %[[RET0_EXPORT]] : !hal.buffer_view
-  return %2 : !hal.buffer_view
+  // CHECK: util.return %[[RET0_EXPORT]] : !hal.buffer_view
+  util.return %2 : !hal.buffer_view
 }
 
 // -----
@@ -54,7 +54,7 @@
 
 // CHECK-LABEL: @custom_ops
 // CHECK-SAME: (%[[ARG:.+]]: !stream.resource<*>, %[[ARG_SIZE:.+]]: index) -> (!stream.resource<*>, index)
-func.func @custom_ops(%arg0: tensor<4x8xf32>) -> tensor<8x4xf32> {
+util.func public @custom_ops(%arg0: tensor<4x8xf32>) -> tensor<8x4xf32> {
   // CHECK: %[[ARG_EXTERNAL:.+]] = stream.async.transfer %[[ARG]]
   // CHECK: %[[ARG_TENSOR:.+]] = stream.tensor.export %[[ARG_EXTERNAL]]
   // CHECK: %[[RET_TENSOR:.+]] = "some.op"(%[[ARG_TENSOR]]) : (tensor<4x8xf32>) -> tensor<8x4xf32>
@@ -62,8 +62,8 @@
   // CHECK: %[[RET_SIZE:.+]] = stream.tensor.sizeof tensor<8x4xf32>
   // CHECK: %[[RET_EXTERNAL:.+]] = stream.tensor.import %[[RET_TENSOR]]
   // CHECK: %[[RET:.+]] = stream.async.transfer %[[RET_EXTERNAL]]
-  // CHECK: return %[[RET]], %[[RET_SIZE]] : !stream.resource<*>, index
-  return %0 : tensor<8x4xf32>
+  // CHECK: util.return %[[RET]], %[[RET_SIZE]] : !stream.resource<*>, index
+  util.return %0 : tensor<8x4xf32>
 }
 
 // -----
@@ -79,8 +79,8 @@
   flow.executable.export public @dispatch
   // CHECK: builtin.module
   builtin.module {
-    // CHECK: func.func @dispatch(%[[BINDING0:.+]]: !stream.binding, %[[BINDING1:.+]]: !stream.binding)
-    func.func @dispatch(%arg0: !flow.dispatch.tensor<readonly:tensor<i32>>, %arg1: !flow.dispatch.tensor<writeonly:tensor<i1>>) {
+    // CHECK:  util.func public @dispatch(%[[BINDING0:.+]]: !stream.binding, %[[BINDING1:.+]]: !stream.binding)
+     util.func public @dispatch(%arg0: !flow.dispatch.tensor<readonly:tensor<i32>>, %arg1: !flow.dispatch.tensor<writeonly:tensor<i1>>) {
       %c3_i32 = arith.constant 3 : i32
       // CHECK: %[[ARG0:.+]] = stream.binding.subspan %[[BINDING0]][%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<i32>>
       // CHECK: %[[ARG1:.+]] = stream.binding.subspan %[[BINDING1]][%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<i1>>
@@ -95,7 +95,7 @@
       } -> tensor<i1>
       // CHECK: flow.dispatch.tensor.store %{{.+}}, %[[ARG1]], offsets = [], sizes = [], strides = [] : tensor<i1> -> !flow.dispatch.tensor<writeonly:tensor<i1>>
       flow.dispatch.tensor.store %2, %arg1, offsets = [], sizes = [], strides = [] : tensor<i1> -> !flow.dispatch.tensor<writeonly:tensor<i1>>
-      return
+      util.return
     }
   }
 }
@@ -104,7 +104,7 @@
 flow.executable private @while_test_dispatch_1 {
   flow.executable.export public @dispatch
   builtin.module  {
-    func.func @dispatch(%arg0: !flow.dispatch.tensor<readonly:tensor<i32>>, %arg1: !flow.dispatch.tensor<writeonly:tensor<i32>>) {
+     util.func public @dispatch(%arg0: !flow.dispatch.tensor<readonly:tensor<i32>>, %arg1: !flow.dispatch.tensor<writeonly:tensor<i32>>) {
       %c2_i32 = arith.constant 2 : i32
       %0 = flow.dispatch.tensor.load %arg0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<i32>> -> tensor<i32>
       %1 = tensor.empty() : tensor<i32>
@@ -114,13 +114,13 @@
         linalg.yield %3 : i32
       } -> tensor<i32>
       flow.dispatch.tensor.store %2, %arg1, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:tensor<i32>>
-      return
+      util.return
     }
   }
 }
 
-// CHECK-LABEL: func.func @while_test
-func.func @while_test() {
+// CHECK-LABEL:  util.func public @while_test
+util.func public @while_test() {
   %c1 = arith.constant 1 : index
 
   // CHECK: %[[CONSTANT:.+]] = stream.tensor.constant : tensor<i32> in !stream.resource<constant> = dense<4> : tensor<i32>
@@ -164,7 +164,7 @@
   // CHECK: %[[TENSOR_CONSTANT:.+]] = stream.tensor.export %[[EXTERNAL_CONSTANT]] : tensor<i32> in !stream.resource<external>{%[[CONSTANT_SIZE]]} -> tensor<i32>
   // CHECK: check.expect_eq(%[[TENSOR_RESULT]], %[[TENSOR_CONSTANT]]) : tensor<i32>
   check.expect_eq(%1, %cst) : tensor<i32>
-  return
+  util.return
 }
 
 // -----
@@ -174,10 +174,10 @@
 
 // CHECK-LABEL: unrealizedCastCleanup
 // CHECK-SAME: (%[[COND:.+]]: i1, %[[LHS:.+]]: !stream.resource<*>, %[[LHS_SIZE:.+]]: index, %[[RHS:.+]]: !stream.resource<*>, %[[RHS_SIZE:.+]]: index) -> (!stream.resource<*>, index)
-func.func @unrealizedCastCleanup(%cond: i1, %lhs: tensor<1024xf32>, %rhs: tensor<1024xf32>) -> tensor<1024xf32> {
+util.func public @unrealizedCastCleanup(%cond: i1, %lhs: tensor<1024xf32>, %rhs: tensor<1024xf32>) -> tensor<1024xf32> {
   // CHECK-DAG: %[[RET:.+]] = arith.select %[[COND]], %[[LHS]], %[[RHS]] : !stream.resource<*>
   // CHECK-DAG: %[[RET_SIZE:.+]] = arith.select %[[COND]], %[[LHS_SIZE]], %[[RHS_SIZE]] : index
   %0 = arith.select %cond, %lhs, %rhs : tensor<1024xf32>
-  // CHECK: return %[[RET]], %[[RET_SIZE]]
-  return %0 : tensor<1024xf32>
+  // CHECK: util.return %[[RET]], %[[RET_SIZE]]
+  util.return %0 : tensor<1024xf32>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/dump_statistics.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/dump_statistics.mlir
index 1e960f8..1507326 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/dump_statistics.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/dump_statistics.mlir

@@ -49,7 +49,7 @@
 stream.executable private @func_a_ex_0 {
   stream.executable.export public @dispatch_0
   builtin.module {
-    func.func @dispatch_0(%arg0: !stream.binding {stream.alignment = 32 : index}, %arg1: !stream.binding {stream.alignment = 32 : index}, %arg2: !stream.binding {stream.alignment = 32 : index}) {
+     util.func public @dispatch_0(%arg0: !stream.binding {stream.alignment = 32 : index}, %arg1: !stream.binding {stream.alignment = 32 : index}, %arg2: !stream.binding {stream.alignment = 32 : index}) {
       %c4 = arith.constant 4 : index
       %c0 = arith.constant 0 : index
       %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4xi32>>
@@ -72,7 +72,7 @@
         } -> tensor<?xi32>
         flow.dispatch.tensor.store %9, %2, offsets = [%arg3], sizes = [%5], strides = [1] : tensor<?xi32> -> !flow.dispatch.tensor<writeonly:tensor<4xi32>>
       }
-      return
+      util.return
     }
   }
 }
@@ -80,7 +80,7 @@
 stream.executable private @func_a_ex_1 {
   stream.executable.export public @dispatch_1
   builtin.module {
-    func.func @dispatch_1(%arg0: !stream.binding {stream.alignment = 32 : index}, %arg1: !stream.binding {stream.alignment = 32 : index}, %arg2: !stream.binding {stream.alignment = 32 : index}) {
+     util.func public @dispatch_1(%arg0: !stream.binding {stream.alignment = 32 : index}, %arg1: !stream.binding {stream.alignment = 32 : index}, %arg2: !stream.binding {stream.alignment = 32 : index}) {
       %c3 = arith.constant 3 : index
       %c0 = arith.constant 0 : index
       %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<3xi32>>
@@ -103,12 +103,12 @@
         } -> tensor<?xi32>
         flow.dispatch.tensor.store %9, %2, offsets = [%arg3], sizes = [%5], strides = [1] : tensor<?xi32> -> !flow.dispatch.tensor<writeonly:tensor<3xi32>>
       }
-      return
+      util.return
     }
   }
 }
 
-func.func public @func_a() -> (tensor<4xi32>, tensor<4xi32>) {
+util.func public @func_a() -> (tensor<4xi32>, tensor<4xi32>) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c4 = arith.constant 4 : index
@@ -144,5 +144,5 @@
   %5 = stream.tensor.export %4 : tensor<4xi32> in !stream.resource<external>{%c16} -> tensor<4xi32>
   %6 = stream.timepoint.await %1 => %0 : !stream.resource<external>{%c16}
   %7 = stream.tensor.export %6 : tensor<4xi32> in !stream.resource<external>{%c16} -> tensor<4xi32>
-  return %5, %7 : tensor<4xi32>, tensor<4xi32>
+  util.return %5, %7 : tensor<4xi32>, tensor<4xi32>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/elide_async_copies.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/elide_async_copies.mlir
index 9686177..c586855 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/elide_async_copies.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/elide_async_copies.mlir

@@ -5,7 +5,7 @@
 // expects us to clean up.
 
 // CHECK-LABEL: @multiUseTiedOperand
-func.func @multiUseTiedOperand(%size: index) -> (!stream.resource<*>, !stream.resource<*>) {
+util.func public @multiUseTiedOperand(%size: index) -> (!stream.resource<*>, !stream.resource<*>) {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   %c256 = arith.constant 256 : index
@@ -22,8 +22,8 @@
   %clone1 = stream.async.clone %splat : !stream.resource<*>{%size} -> !stream.resource<*>{%size}
   // CHECK: %[[FILL1:.+]] = stream.async.fill %c789_i32, %[[SPLAT]]
   %fill1 = stream.async.fill %c789_i32, %clone1[%c128 to %c256 for %c128] : i32 -> %3 as !stream.resource<*>{%size}
-  // CHECK: return %[[FILL0]], %[[FILL1]]
-  return %fill0, %fill1 : !stream.resource<*>, !stream.resource<*>
+  // CHECK: util.return %[[FILL0]], %[[FILL1]]
+  util.return %fill0, %fill1 : !stream.resource<*>, !stream.resource<*>
 }
 
 // -----
@@ -34,7 +34,7 @@
 
 // CHECK-LABEL: @argMoveCallee
 // CHECK-SAME: (%[[ARG0:.+]]: !stream.resource<*>
-func.func private @argMoveCallee(%arg: !stream.resource<*>, %size: index) -> !stream.resource<*> {
+util.func private @argMoveCallee(%arg: !stream.resource<*>, %size: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   %c123_i32 = arith.constant 123 : i32
@@ -42,16 +42,16 @@
   %clone = stream.async.clone %arg : !stream.resource<*>{%size} -> !stream.resource<*>{%size}
   // CHECK: %[[FILL:.+]] = stream.async.fill %c123_i32, %[[ARG0]]
   %fill = stream.async.fill %c123_i32, %clone[%c0 to %c128 for %c128] : i32 -> %0 as !stream.resource<*>{%size}
-  // CHECK: return %[[FILL]]
-  return %fill : !stream.resource<*>
+  // CHECK: util.return %[[FILL]]
+  util.return %fill : !stream.resource<*>
 }
 // CHECK: @argMoveCaller
-func.func @argMoveCaller(%size: index) -> !stream.resource<*> {
+util.func public @argMoveCaller(%size: index) -> !stream.resource<*> {
   %c123_i32 = arith.constant 123 : i32
   // CHECK: stream.async.splat
   %splat = stream.async.splat %c123_i32 : i32 -> !stream.resource<*>{%size}
-  %result = call @argMoveCallee(%splat, %size) : (!stream.resource<*>, index) -> !stream.resource<*>
-  return %result : !stream.resource<*>
+  %result = util.call @argMoveCallee(%splat, %size) : (!stream.resource<*>, index) -> !stream.resource<*>
+  util.return %result : !stream.resource<*>
 }
 
 // -----
@@ -60,7 +60,7 @@
 // call and passed by const-reference.
 
 // CHECK-LABEL: @argCopyCallee
-func.func private @argCopyCallee(%arg: !stream.resource<*>, %size: index) -> !stream.resource<*> {
+util.func private @argCopyCallee(%arg: !stream.resource<*>, %size: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   %c123_i32 = arith.constant 123 : i32
@@ -68,15 +68,15 @@
   %clone = stream.async.clone %arg : !stream.resource<*>{%size} -> !stream.resource<*>{%size}
   // CHECK: stream.async.fill
   %fill = stream.async.fill %c123_i32, %clone[%c0 to %c128 for %c128] : i32 -> %0 as !stream.resource<*>{%size}
-  return %fill : !stream.resource<*>
+  util.return %fill : !stream.resource<*>
 }
 // CHECK: @argCopyCaller
-func.func @argCopyCaller(%size: index) -> (!stream.resource<*>, !stream.resource<*>) {
+util.func public @argCopyCaller(%size: index) -> (!stream.resource<*>, !stream.resource<*>) {
   %c123_i32 = arith.constant 123 : i32
   // CHECK: stream.async.splat
   %splat = stream.async.splat %c123_i32 : i32 -> !stream.resource<*>{%size}
-  %result = call @argCopyCallee(%splat, %size) : (!stream.resource<*>, index) -> !stream.resource<*>
-  return %splat, %result : !stream.resource<*>, !stream.resource<*>
+  %result = util.call @argCopyCallee(%splat, %size) : (!stream.resource<*>, index) -> !stream.resource<*>
+  util.return %splat, %result : !stream.resource<*>, !stream.resource<*>
 }
 
 // -----
@@ -90,7 +90,7 @@
 
 // CHECK-LABEL: @blockArgMove
 // CHECK-SAME: (%[[COND:.+]]: i1
-func.func private @blockArgMove(%cond: i1, %size: index) -> (!stream.resource<*>, !stream.resource<*>) {
+util.func private @blockArgMove(%cond: i1, %size: index) -> (!stream.resource<*>, !stream.resource<*>) {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   %c123_i32 = arith.constant 123 : i32
@@ -118,5 +118,5 @@
   cf.cond_br %cond, ^bb1(%fill0, %bb1_1_new : !stream.resource<*>, !stream.resource<*>),
                  ^bb2(%fill0, %bb1_1_new : !stream.resource<*>, !stream.resource<*>)
 ^bb2(%bb2_0: !stream.resource<*>, %bb2_1: !stream.resource<*>):
-  return %bb2_0, %bb2_1 : !stream.resource<*>, !stream.resource<*>
+  util.return %bb2_0, %bb2_1 : !stream.resource<*>, !stream.resource<*>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/elide_timepoints_coverage.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/elide_timepoints_coverage.mlir
index c21c390..30aae28 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/elide_timepoints_coverage.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/elide_timepoints_coverage.mlir

@@ -16,15 +16,15 @@
 }
 
 // CHECK-LABEL: @initializedGlobals
-func.func private @initializedGlobals() -> !stream.timepoint {
+util.func private @initializedGlobals() -> !stream.timepoint {
   // CHECK: %[[GLOBAL0:.+]] = util.global.load @global0
   %global0 = util.global.load @global0 : !stream.timepoint
   // CHECK: %[[GLOBAL1:.+]] = util.global.load @global1
   %global1 = util.global.load @global1 : !stream.timepoint
   // CHECK: %[[JOIN:.+]] = stream.timepoint.join max(%[[GLOBAL0]], %[[GLOBAL1]])
   %join = stream.timepoint.join max(%global0, %global1) => !stream.timepoint
-  // CHECK: return %[[JOIN]]
-  return %join : !stream.timepoint
+  // CHECK: util.return %[[JOIN]]
+  util.return %join : !stream.timepoint
 }
 
 // -----
@@ -32,11 +32,11 @@
 // Tests that meaningful timeline ops are never marked immediate.
 
 // CHECK-LABEL: @nonImmediate
-func.func private @nonImmediate() -> !stream.timepoint {
+util.func private @nonImmediate() -> !stream.timepoint {
   // CHECK: %[[EXECUTE:.+]] = stream.cmd.execute
   %0 = stream.cmd.execute with() {} => !stream.timepoint
-  // CHECK: return %[[EXECUTE]]
-  return %0 : !stream.timepoint
+  // CHECK: util.return %[[EXECUTE]]
+  util.return %0 : !stream.timepoint
 }
 
 // -----
@@ -45,7 +45,7 @@
 // by both %exec1a and %exec1b and does not need to be joined.
 
 // CHECK-LABEL: @joinChained
-func.func @joinChained() -> !stream.timepoint {
+util.func public @joinChained() -> !stream.timepoint {
   // CHECK: %[[EXEC0:.+]] = stream.cmd.execute with
   %exec0 = stream.cmd.execute with() {} => !stream.timepoint
   // CHECK: %[[EXEC1A:.+]] = stream.cmd.execute await(%[[EXEC0]])
@@ -55,8 +55,8 @@
   // CHECK: %[[EXEC0_IMM:.+]] = stream.timepoint.immediate
   // CHECK: %[[JOIN:.+]] = stream.timepoint.join max(%[[EXEC0_IMM]], %[[EXEC1A]], %[[EXEC1B]])
   %join = stream.timepoint.join max(%exec0, %exec1a, %exec1b) => !stream.timepoint
-  // CHECK: return %[[JOIN]]
-  return %join : !stream.timepoint
+  // CHECK: util.return %[[JOIN]]
+  util.return %join : !stream.timepoint
 }
 
 // -----
@@ -66,7 +66,7 @@
 
 // CHECK-LABEL: @selectCovered
 // CHECK-SAME: (%[[COND:.+]]: i1)
-func.func @selectCovered(%cond: i1) -> !stream.timepoint {
+util.func public @selectCovered(%cond: i1) -> !stream.timepoint {
   // CHECK: %[[EXEC0:.+]] = stream.cmd.execute
   %exec0 = stream.cmd.execute with() {} => !stream.timepoint
   // CHECK: %[[EXEC1A:.+]] = stream.cmd.execute await(%[[EXEC0]])
@@ -78,8 +78,8 @@
   // CHECK: %[[EXEC0_IMM:.+]] = stream.timepoint.immediate
   // CHECK: %[[JOIN:.+]] = stream.timepoint.join max(%[[EXEC0_IMM]], %[[SELECT]])
   %join = stream.timepoint.join max(%exec0, %select) => !stream.timepoint
-  // CHECK: return %[[JOIN]]
-  return %join : !stream.timepoint
+  // CHECK: util.return %[[JOIN]]
+  util.return %join : !stream.timepoint
 }
 
 // -----
@@ -87,25 +87,25 @@
 // Tests that a timepoint passed along a call edge is propagated.
 // %t0/%t1 are covered by the call result %call that joins the two together.
 
-// CHECK-LABEL: func @caller
+// CHECK-LABEL: util.func public @caller
 // CHECK-SAME: (%[[T0:.+]]: !stream.timepoint, %[[T1:.+]]: !stream.timepoint)
-func.func @caller(%t0: !stream.timepoint, %t1: !stream.timepoint) -> !stream.timepoint {
-  // CHECK: %[[CALL:.+]] = call @callee(%[[T0]], %[[T1]])
-  %call = call @callee(%t0, %t1) : (!stream.timepoint, !stream.timepoint) -> !stream.timepoint
+util.func public @caller(%t0: !stream.timepoint, %t1: !stream.timepoint) -> !stream.timepoint {
+  // CHECK: %[[CALL:.+]] = util.call @callee(%[[T0]], %[[T1]])
+  %call = util.call @callee(%t0, %t1) : (!stream.timepoint, !stream.timepoint) -> !stream.timepoint
   // CHECK-DAG: %[[T0_COVERED:.+]] = stream.timepoint.immediate
   // CHECK-DAG: %[[T1_COVERED:.+]] = stream.timepoint.immediate
   // CHECK: %[[JOIN:.+]] = stream.timepoint.join max(%[[T0_COVERED]], %[[T1_COVERED]], %[[CALL]])
   %join = stream.timepoint.join max(%t0, %t1, %call) => !stream.timepoint
-  // CHECK: return %[[JOIN]]
-  return %join : !stream.timepoint
+  // CHECK: util.return %[[JOIN]]
+  util.return %join : !stream.timepoint
 }
-// CHECK-LABEL: func private @callee
-func.func private @callee(%t0a: !stream.timepoint, %t0b: !stream.timepoint) -> !stream.timepoint {
+// CHECK-LABEL: util.func private @callee
+util.func private @callee(%t0a: !stream.timepoint, %t0b: !stream.timepoint) -> !stream.timepoint {
   // CHECK-NOT: stream.timepoint.immediate
   // CHECK: %[[JOIN_CALLEE:.+]] = stream.timepoint.join max
   %t1 = stream.timepoint.join max(%t0a, %t0b) => !stream.timepoint
-  // CHECK: return %[[JOIN_CALLEE]]
-  return %t1 : !stream.timepoint
+  // CHECK: util.return %[[JOIN_CALLEE]]
+  util.return %t1 : !stream.timepoint
 }
 
 // -----
@@ -117,18 +117,18 @@
 // the same and instead just handle coverage (hitting either call results is
 // the same as hitting the original arg).
 
-// CHECK-LABEL: func @callerDupes
-func.func @callerDupes(%unknown: !stream.timepoint) -> !stream.timepoint {
-  // CHECK: %[[CALL:.+]]:2 = call @calleeDupes
-  %call:2 = call @calleeDupes(%unknown, %unknown) : (!stream.timepoint, !stream.timepoint) -> (!stream.timepoint, !stream.timepoint)
+// CHECK-LABEL: util.func public @callerDupes
+util.func public @callerDupes(%unknown: !stream.timepoint) -> !stream.timepoint {
+  // CHECK: %[[CALL:.+]]:2 = util.call @calleeDupes
+  %call:2 = util.call @calleeDupes(%unknown, %unknown) : (!stream.timepoint, !stream.timepoint) -> (!stream.timepoint, !stream.timepoint)
   // CHECK-NEXT: %[[UNKNOWN_IMM:.+]] = stream.timepoint.immediate
   // CHECK: %[[JOIN:.+]] = stream.timepoint.join max(%[[UNKNOWN_IMM]], %[[CALL]]#0, %[[CALL]]#1)
   %join = stream.timepoint.join max(%unknown, %call#0, %call#1) => !stream.timepoint
-  // CHECK: return %[[JOIN]]
-  return %join : !stream.timepoint
+  // CHECK: util.return %[[JOIN]]
+  util.return %join : !stream.timepoint
 }
-func.func private @calleeDupes(%arg0: !stream.timepoint, %arg1: !stream.timepoint) -> (!stream.timepoint, !stream.timepoint) {
-  return %arg0, %arg1 : !stream.timepoint, !stream.timepoint
+util.func private @calleeDupes(%arg0: !stream.timepoint, %arg1: !stream.timepoint) -> (!stream.timepoint, !stream.timepoint) {
+  util.return %arg0, %arg1 : !stream.timepoint, !stream.timepoint
 }
 
 // -----
@@ -140,24 +140,24 @@
 // TODO(benvanik): we should also be able to trim the calls/t1 and only use
 // %t01 but that needs some work to know that call0 == t0 and call1 == t01.
 
-// CHECK-LABEL: func @nonUniformCaller
+// CHECK-LABEL: util.func public @nonUniformCaller
 // CHECK-SAME: (%[[T0:.+]]: !stream.timepoint, %[[T1:.+]]: !stream.timepoint)
-func.func @nonUniformCaller(%t0: !stream.timepoint, %t1: !stream.timepoint) -> !stream.timepoint {
-  // CHECK: %[[CALL0:.+]] = call @nonUniformCallee(%[[T0]])
-  %call0 = call @nonUniformCallee(%t0) : (!stream.timepoint) -> !stream.timepoint
+util.func public @nonUniformCaller(%t0: !stream.timepoint, %t1: !stream.timepoint) -> !stream.timepoint {
+  // CHECK: %[[CALL0:.+]] = util.call @nonUniformCallee(%[[T0]])
+  %call0 = util.call @nonUniformCallee(%t0) : (!stream.timepoint) -> !stream.timepoint
   // CHECK: %[[T01:.+]] = stream.timepoint.join max(%[[T0]], %[[T1]])
   %t01 = stream.timepoint.join max(%t0, %t1) => !stream.timepoint
-  // CHECK: %[[CALL1:.+]] = call @nonUniformCallee(%[[T01]])
-  %call1 = call @nonUniformCallee(%t01) : (!stream.timepoint) -> !stream.timepoint
+  // CHECK: %[[CALL1:.+]] = util.call @nonUniformCallee(%[[T01]])
+  %call1 = util.call @nonUniformCallee(%t01) : (!stream.timepoint) -> !stream.timepoint
   // CHECK: %[[T0_IMM:.+]] = stream.timepoint.immediate
   // CHECK: %[[JOIN:.+]] = stream.timepoint.join max(%[[T0_IMM]], %[[CALL0]], %[[T1]], %[[CALL1]])
   %join = stream.timepoint.join max(%t0, %call0, %t1, %call1) => !stream.timepoint
-  // CHECK: return %[[JOIN]]
-  return %join : !stream.timepoint
+  // CHECK: util.return %[[JOIN]]
+  util.return %join : !stream.timepoint
 }
-// CHECK: func private @nonUniformCallee
-func.func private @nonUniformCallee(%arg0: !stream.timepoint) -> !stream.timepoint {
-  return %arg0 : !stream.timepoint
+// CHECK: util.func private @nonUniformCallee
+util.func private @nonUniformCallee(%arg0: !stream.timepoint) -> !stream.timepoint {
+  util.return %arg0 : !stream.timepoint
 }
 
 // -----
@@ -165,9 +165,9 @@
 // Tests that timepoints are tracked through branches args.
 // In this simple case %bb1_t0 always covers %t0.
 
-// CHECK-LABEL: func @branch
+// CHECK-LABEL: util.func public @branch
 // CHECK-SAME: (%[[T0:.+]]: !stream.timepoint)
-func.func @branch(%t0: !stream.timepoint) -> !stream.timepoint {
+util.func public @branch(%t0: !stream.timepoint) -> !stream.timepoint {
   // CHECK: cf.br ^bb1
   cf.br ^bb1(%t0 : !stream.timepoint)
 // CHECK-NEXT: ^bb1(%[[BB1_T0:.+]]: !stream.timepoint)
@@ -175,8 +175,8 @@
   // CHECK: %[[T0_IMM:.+]] = stream.timepoint.immediate
   // CHECK-NEXT: %[[JOIN:.+]] = stream.timepoint.join max(%[[T0_IMM]], %[[BB1_T0]])
   %join = stream.timepoint.join max(%t0, %bb1_t0) => !stream.timepoint
-  // CHECK: return %[[JOIN]]
-  return %join : !stream.timepoint
+  // CHECK: util.return %[[JOIN]]
+  util.return %join : !stream.timepoint
 }
 
 // -----
@@ -184,9 +184,9 @@
 // Tests that forward edges with convergent timepoints track coverage.
 // Here both true and false paths cover %t0 and it can be elided at the join.
 
-// CHECK-LABEL: func @branchConvergentForwardEdge
+// CHECK-LABEL: util.func public @branchConvergentForwardEdge
 // CHECK-SAME: (%[[COND:.+]]: i1, %[[T0:.+]]: !stream.timepoint)
-func.func @branchConvergentForwardEdge(%cond: i1, %t0: !stream.timepoint) -> !stream.timepoint {
+util.func public @branchConvergentForwardEdge(%cond: i1, %t0: !stream.timepoint) -> !stream.timepoint {
   // CHECK: %[[T1A:.+]] = stream.cmd.execute await(%[[T0]])
   %t1a = stream.cmd.execute await(%t0) => with() {} => !stream.timepoint
   // CHECK: %[[T1B:.+]] = stream.cmd.execute await(%[[T0]])
@@ -200,8 +200,8 @@
   // CHECK: %[[T0_IMM:.+]] = stream.timepoint.immediate
   // CHECK-NEXT: %[[JOIN:.+]] = stream.timepoint.join max(%[[T0_IMM]], %[[BB1_ARG]])
   %join = stream.timepoint.join max(%t0, %bb1_arg) => !stream.timepoint
-  // CHECK: return %[[JOIN]]
-  return %join : !stream.timepoint
+  // CHECK: util.return %[[JOIN]]
+  util.return %join : !stream.timepoint
 }
 
 // -----
@@ -209,9 +209,9 @@
 // Tests that forward edges with divergent timepoint coverage get propagated.
 // %t0 is covered on both paths but %t1 is only covered when %cond == true.
 
-// CHECK-LABEL: func @branchDivergentForwardEdge
+// CHECK-LABEL: util.func public @branchDivergentForwardEdge
 // CHECK-SAME: (%[[COND:.+]]: i1, %[[T0:.+]]: !stream.timepoint, %[[T1:.+]]: !stream.timepoint)
-func.func @branchDivergentForwardEdge(%cond: i1, %t0: !stream.timepoint, %t1: !stream.timepoint) -> !stream.timepoint {
+util.func public @branchDivergentForwardEdge(%cond: i1, %t0: !stream.timepoint, %t1: !stream.timepoint) -> !stream.timepoint {
   // CHECK: %[[T01:.+]] = stream.timepoint.join max(%[[T0]], %[[T1]])
   %t01 = stream.timepoint.join max(%t0, %t1) => !stream.timepoint
   // CHECK-NEXT: cf.cond_br
@@ -223,8 +223,8 @@
   // CHECK: %[[T0_IMM:.+]] = stream.timepoint.immediate
   // CHECK-NEXT: %[[JOIN:.+]] = stream.timepoint.join max(%[[T0_IMM]], %[[T1]], %[[BB1_ARG]])
   %join = stream.timepoint.join max(%t0, %t1, %bb1_arg) => !stream.timepoint
-  // CHECK: return %[[JOIN]]
-  return %join : !stream.timepoint
+  // CHECK: util.return %[[JOIN]]
+  util.return %join : !stream.timepoint
 }
 
 // -----
@@ -236,9 +236,9 @@
 // must-be-executed-context-like machinery in order to do so. We just want to
 // make sure we're preserving the timepoints here for correctness.
 
-// CHECK-LABEL: func @branchDivergentBackEdge
+// CHECK-LABEL: util.func public @branchDivergentBackEdge
 // CHECK-SAME: (%[[COND:.+]]: i1, %[[T0:.+]]: !stream.timepoint)
-func.func @branchDivergentBackEdge(%cond: i1, %t0: !stream.timepoint) -> !stream.timepoint {
+util.func public @branchDivergentBackEdge(%cond: i1, %t0: !stream.timepoint) -> !stream.timepoint {
   // CHECK: cf.br ^bb1
   cf.br ^bb1(%cond, %t0 : i1, !stream.timepoint)
 // CHECK-NEXT: ^bb1(%[[BB1_COND:.+]]: i1, %[[BB1_T0:.+]]: !stream.timepoint)
@@ -256,8 +256,8 @@
 ^bb2(%bb2_t1: !stream.timepoint):
   // CHECK: %[[JOIN:.+]] = stream.timepoint.join max(%[[T0]], %[[BB2_T1]])
   %join = stream.timepoint.join max(%t0, %bb2_t1) => !stream.timepoint
-  // CHECK-NEXT: return %[[JOIN]]
-  return %join : !stream.timepoint
+  // CHECK-NEXT: util.return %[[JOIN]]
+  util.return %join : !stream.timepoint
 }
 
 // -----
@@ -265,9 +265,9 @@
 // Tests that scf.if regions with convergent yields are handled.
 // Here %t0 is covered regardless of the %cond and can be elided.
 
-// CHECK-LABEL: func @scfIfConvergent
+// CHECK-LABEL: util.func public @scfIfConvergent
 // CHECK-SAME: (%[[COND:.+]]: i1, %[[T0:.+]]: !stream.timepoint, %[[T1:.+]]: !stream.timepoint)
-func.func @scfIfConvergent(%cond: i1, %t0: !stream.timepoint, %t1: !stream.timepoint) -> !stream.timepoint {
+util.func public @scfIfConvergent(%cond: i1, %t0: !stream.timepoint, %t1: !stream.timepoint) -> !stream.timepoint {
   // CHECK: %[[IF:.+]] = scf.if
   %if = scf.if %cond -> !stream.timepoint {
     // CHECK: yield %[[T0]]
@@ -281,8 +281,8 @@
   // CHECK: %[[T0_IMM:.+]] = stream.timepoint.immediate
   // CHECK-NEXT: %[[JOIN:.+]] = stream.timepoint.join max(%[[T0_IMM]], %[[T1]], %[[IF]])
   %join = stream.timepoint.join max(%t0, %t1, %if) => !stream.timepoint
-  // CHECK: return %[[JOIN]]
-  return %join : !stream.timepoint
+  // CHECK: util.return %[[JOIN]]
+  util.return %join : !stream.timepoint
 }
 
 // TODO(benvanik): support scf.for

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/elide_timepoints_immediate.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/elide_timepoints_immediate.mlir
index 147c1c6..aa058c5 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/elide_timepoints_immediate.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/elide_timepoints_immediate.mlir

@@ -3,14 +3,14 @@
 // Tests that joins with multiple immediate timepoints are marked as immediate.
 
 // CHECK-LABEL: @immediateJoin
-func.func private @immediateJoin() -> !stream.timepoint {
+util.func private @immediateJoin() -> !stream.timepoint {
   %imm0 = stream.timepoint.immediate => !stream.timepoint
   %imm1 = stream.timepoint.immediate => !stream.timepoint
   // CHECK: stream.timepoint.join
   // CHECK-NEXT: %[[JOIN_IMM:.+]] = stream.timepoint.immediate
   %0 = stream.timepoint.join max(%imm0, %imm1) => !stream.timepoint
-  // CHECK: return %[[JOIN_IMM]]
-  return %0 : !stream.timepoint
+  // CHECK: util.return %[[JOIN_IMM]]
+  util.return %0 : !stream.timepoint
 }
 
 // -----
@@ -19,13 +19,13 @@
 
 // CHECK-LABEL: @nonImmediateJoin
 // CHECK-SAME: (%[[NON_IMM:.+]]: !stream.timepoint)
-func.func @nonImmediateJoin(%arg0: !stream.timepoint) -> !stream.timepoint {
+util.func public @nonImmediateJoin(%arg0: !stream.timepoint) -> !stream.timepoint {
   // CHECK: %[[IMM:.+]] = stream.timepoint.immediate
   %imm = stream.timepoint.immediate => !stream.timepoint
   // CHECK: %[[JOIN:.+]] = stream.timepoint.join max(%[[NON_IMM]], %[[IMM]])
   %0 = stream.timepoint.join max(%arg0, %imm) => !stream.timepoint
-  // CHECK: return %[[JOIN]]
-  return %0 : !stream.timepoint
+  // CHECK: util.return %[[JOIN]]
+  util.return %0 : !stream.timepoint
 }
 
 // -----
@@ -33,14 +33,14 @@
 // Tests that a select between immediate values is marked immediate.
 
 // CHECK-LABEL: @selectSame
-func.func @selectSame(%cond: i1) -> !stream.timepoint {
+util.func public @selectSame(%cond: i1) -> !stream.timepoint {
   %imm0 = stream.timepoint.immediate => !stream.timepoint
   %imm1 = stream.timepoint.immediate => !stream.timepoint
   // CHECK: arith.select
   // CHECK-NEXT: %[[SELECT_IMM:.+]] = stream.timepoint.immediate
   %select = arith.select %cond, %imm0, %imm1 : !stream.timepoint
-  // CHECK: return %[[SELECT_IMM]]
-  return %select : !stream.timepoint
+  // CHECK: util.return %[[SELECT_IMM]]
+  util.return %select : !stream.timepoint
 }
 
 // -----
@@ -48,12 +48,12 @@
 // Tests that a select with one or more unknown value is not marked immediate.
 
 // CHECK-LABEL: @selectDifferent
-func.func @selectDifferent(%cond: i1, %unknown: !stream.timepoint) -> !stream.timepoint {
+util.func public @selectDifferent(%cond: i1, %unknown: !stream.timepoint) -> !stream.timepoint {
   %imm = stream.timepoint.immediate => !stream.timepoint
   // CHECK: %[[SELECT:.+]] = arith.select
   %select = arith.select %cond, %imm, %unknown : !stream.timepoint
-  // CHECK: return %[[SELECT]]
-  return %select : !stream.timepoint
+  // CHECK: util.return %[[SELECT]]
+  util.return %select : !stream.timepoint
 }
 
 // -----
@@ -63,11 +63,11 @@
 util.global private mutable @global = #stream.timepoint<immediate> : !stream.timepoint
 
 // CHECK-LABEL: @immediateGlobal
-func.func private @immediateGlobal() -> !stream.timepoint {
+util.func private @immediateGlobal() -> !stream.timepoint {
   // CHECK: %[[IMM:.+]] = stream.timepoint.immediate
   %global = util.global.load @global : !stream.timepoint
-  // CHECK: return %[[IMM]]
-  return %global : !stream.timepoint
+  // CHECK: util.return %[[IMM]]
+  util.return %global : !stream.timepoint
 }
 
 // -----
@@ -77,19 +77,19 @@
 util.global private mutable @global : !stream.timepoint
 
 // CHECK-LABEL: @uniformGlobal
-func.func private @uniformGlobal() -> !stream.timepoint {
+util.func private @uniformGlobal() -> !stream.timepoint {
   %imm = stream.timepoint.immediate => !stream.timepoint
   util.global.store %imm, @global : !stream.timepoint
   // CHECK: util.global.load
   %global = util.global.load @global : !stream.timepoint
   // CHECK: %[[IMM:.+]] = stream.timepoint.immediate
-  // CHECK: return %[[IMM]]
-  return %global : !stream.timepoint
+  // CHECK: util.return %[[IMM]]
+  util.return %global : !stream.timepoint
 }
-func.func private @globalSetter() {
+util.func private @globalSetter() {
   %imm = stream.timepoint.immediate => !stream.timepoint
   util.global.store %imm, @global : !stream.timepoint
-  return
+  util.return
 }
 
 // -----
@@ -99,16 +99,16 @@
 util.global private mutable @global = #stream.timepoint<immediate> : !stream.timepoint
 
 // CHECK-LABEL: @nonUniformGlobal
-func.func private @nonUniformGlobal() -> !stream.timepoint {
+util.func private @nonUniformGlobal() -> !stream.timepoint {
   // CHECK-NOT: stream.timepoint.immediate
   // CHECK: %[[GLOBAL:.+]] = util.global.load @global
   %global = util.global.load @global : !stream.timepoint
-  // CHECK: return %[[GLOBAL]]
-  return %global : !stream.timepoint
+  // CHECK: util.return %[[GLOBAL]]
+  util.return %global : !stream.timepoint
 }
-func.func @globalSetter(%arg0: !stream.timepoint) {
+util.func public @globalSetter(%arg0: !stream.timepoint) {
   util.global.store %arg0, @global : !stream.timepoint
-  return
+  util.return
 }
 
 // -----
@@ -116,92 +116,92 @@
 // Tests that meaningful timeline ops are never marked immediate.
 
 // CHECK-LABEL: @nonImmediate
-func.func private @nonImmediate() -> !stream.timepoint {
+util.func private @nonImmediate() -> !stream.timepoint {
   %imm = stream.timepoint.immediate => !stream.timepoint
   // CHECK: %[[EXECUTE:.+]] = stream.cmd.execute
   %0 = stream.cmd.execute await(%imm) => with() {} => !stream.timepoint
-  // CHECK: return %[[EXECUTE]]
-  return %0 : !stream.timepoint
+  // CHECK: util.return %[[EXECUTE]]
+  util.return %0 : !stream.timepoint
 }
 
 // -----
 
 // Tests that an immediate timepoint passed along a call edge is propagated.
 
-// CHECK-LABEL: func @caller
-func.func @caller() -> !stream.timepoint {
+// CHECK-LABEL: util.func public @caller
+util.func public @caller() -> !stream.timepoint {
   // CHECK: %[[T0_IMM:.+]] = stream.timepoint.immediate
   %t0 = stream.timepoint.immediate => !stream.timepoint
-  // CHECK: %[[T1:.+]] = call @callee(%[[T0_IMM]], %[[T0_IMM]])
+  // CHECK: %[[T1:.+]] = util.call @callee(%[[T0_IMM]], %[[T0_IMM]])
   // CHECK-NEXT: %[[T1_IMM:.+]] = stream.timepoint.immediate
-  %t1 = call @callee(%t0, %t0) : (!stream.timepoint, !stream.timepoint) -> !stream.timepoint
+  %t1 = util.call @callee(%t0, %t0) : (!stream.timepoint, !stream.timepoint) -> !stream.timepoint
   // CHECK: %[[T2:.+]] = stream.timepoint.join max(%[[T0_IMM]], %[[T1_IMM]])
   // CHECK-NEXT: %[[T2_IMM:.+]] = stream.timepoint.immediate
   %t2 = stream.timepoint.join max(%t0, %t1) => !stream.timepoint
-  // CHECK: return %[[T2_IMM]]
-  return %t2 : !stream.timepoint
+  // CHECK: util.return %[[T2_IMM]]
+  util.return %t2 : !stream.timepoint
 }
-// CHECK-LABEL: func private @callee
-func.func private @callee(%t0a: !stream.timepoint, %t0b: !stream.timepoint) -> !stream.timepoint {
+// CHECK-LABEL: util.func private @callee
+util.func private @callee(%t0a: !stream.timepoint, %t0b: !stream.timepoint) -> !stream.timepoint {
   // CHECK: %[[T0A_IMM:.+]] = stream.timepoint.immediate
   // CHECK: %[[T0B_IMM:.+]] = stream.timepoint.immediate
   // CHECK-NEXT: %[[T1:.+]] = stream.timepoint.join max(%[[T0A_IMM]], %[[T0B_IMM]])
   %t1 = stream.timepoint.join max(%t0a, %t0b) => !stream.timepoint
   // CHECK-NEXT: %[[T1_IMM:.+]] = stream.timepoint.immediate
-  // CHECK-NEXT: return %[[T1_IMM]]
-  return %t1 : !stream.timepoint
+  // CHECK-NEXT: util.return %[[T1_IMM]]
+  util.return %t1 : !stream.timepoint
 }
 
 // -----
 
 // Tests that duplicate call args/results are handled correctly.
 
-// CHECK-LABEL: func @callerDupes
-func.func @callerDupes() -> !stream.timepoint {
+// CHECK-LABEL: util.func public @callerDupes
+util.func public @callerDupes() -> !stream.timepoint {
   // CHECK: %[[IMM:.+]] = stream.timepoint.immediate
   %imm = stream.timepoint.immediate => !stream.timepoint
-  // CHECK: %[[CALL:.+]]:2 = call @calleeDupes
+  // CHECK: %[[CALL:.+]]:2 = util.call @calleeDupes
   // CHECK-NEXT: %[[CALL_IMM0:.+]] = stream.timepoint.immediate
   // CHECK-NEXT: %[[CALL_IMM1:.+]] = stream.timepoint.immediate
-  %call:2 = call @calleeDupes(%imm, %imm) : (!stream.timepoint, !stream.timepoint) -> (!stream.timepoint, !stream.timepoint)
+  %call:2 = util.call @calleeDupes(%imm, %imm) : (!stream.timepoint, !stream.timepoint) -> (!stream.timepoint, !stream.timepoint)
   // CHECK: %[[JOIN:.+]] = stream.timepoint.join max(%[[CALL_IMM0]], %[[CALL_IMM1]])
   // CHECK-NEXT: %[[JOIN_IMM:.+]] = stream.timepoint.immediate
   %join = stream.timepoint.join max(%call#0, %call#1) => !stream.timepoint
-  // CHECK: return %[[JOIN_IMM]]
-  return %join : !stream.timepoint
+  // CHECK: util.return %[[JOIN_IMM]]
+  util.return %join : !stream.timepoint
 }
-func.func private @calleeDupes(%arg0: !stream.timepoint, %arg1: !stream.timepoint) -> (!stream.timepoint, !stream.timepoint) {
-  return %arg0, %arg1 : !stream.timepoint, !stream.timepoint
+util.func private @calleeDupes(%arg0: !stream.timepoint, %arg1: !stream.timepoint) -> (!stream.timepoint, !stream.timepoint) {
+  util.return %arg0, %arg1 : !stream.timepoint, !stream.timepoint
 }
 
 // -----
 
 // Tests that convergent caller timepoints are handled correctly.
 
-// CHECK-LABEL: func @uniformCaller
-func.func @uniformCaller() -> !stream.timepoint {
+// CHECK-LABEL: util.func public @uniformCaller
+util.func public @uniformCaller() -> !stream.timepoint {
   // CHECK: %[[IMM:.+]] = stream.timepoint.immediate
   %imm = stream.timepoint.immediate => !stream.timepoint
   // CHECK-NEXT: call @uniformCallee(%[[IMM]])
   // CHECK-NEXT: %[[CALL_IMM0:.+]] = stream.timepoint.immediate
-  %call0 = call @uniformCallee(%imm) : (!stream.timepoint) -> !stream.timepoint
+  %call0 = util.call @uniformCallee(%imm) : (!stream.timepoint) -> !stream.timepoint
   // CHECK-NEXT: call @uniformCallee(%[[IMM]])
   // CHECK-NEXT: %[[CALL_IMM1:.+]] = stream.timepoint.immediate
-  %call1 = call @uniformCallee(%imm) : (!stream.timepoint) -> !stream.timepoint
+  %call1 = util.call @uniformCallee(%imm) : (!stream.timepoint) -> !stream.timepoint
   // CHECK-NEXT: %[[CALLER_JOIN:.+]] = stream.timepoint.join max(%[[CALL_IMM0]], %[[CALL_IMM1]])
   // CHECK-NEXT: %[[CALLER_JOIN_IMM:.+]] = stream.timepoint.immediate
   %join = stream.timepoint.join max(%call0, %call1) => !stream.timepoint
-  // CHECK: return %[[CALLER_JOIN_IMM]]
-  return %join : !stream.timepoint
+  // CHECK: util.return %[[CALLER_JOIN_IMM]]
+  util.return %join : !stream.timepoint
 }
-// CHECK: func private @uniformCallee
-func.func private @uniformCallee(%arg0: !stream.timepoint) -> !stream.timepoint {
+// CHECK: util.func private @uniformCallee
+util.func private @uniformCallee(%arg0: !stream.timepoint) -> !stream.timepoint {
   // CHECK: %[[ARG0_IMM:.+]] = stream.timepoint.immediate
   // CHECK: %[[CALLEE_JOIN:.+]] = stream.timepoint.join max(%[[ARG0_IMM]])
   // CHECK-NEXT: %[[CALLEE_JOIN_IMM:.+]] = stream.timepoint.immediate
   %0 = stream.timepoint.join max(%arg0) => !stream.timepoint
-  // CHECK: return %[[CALLEE_JOIN_IMM]]
-  return %0 : !stream.timepoint
+  // CHECK: util.return %[[CALLEE_JOIN_IMM]]
+  util.return %0 : !stream.timepoint
 }
 
 // -----
@@ -211,37 +211,37 @@
 // should be immediate - today, though, we aggregate over callers and any one
 // that may pass a non-immediate poisons the analysis.
 
-// CHECK-LABEL: func @nonUniformCaller
+// CHECK-LABEL: util.func public @nonUniformCaller
 // CHECK-SAME: (%[[UNKNOWN:.+]]: !stream.timepoint)
-func.func @nonUniformCaller(%unknown: !stream.timepoint) -> !stream.timepoint {
+util.func public @nonUniformCaller(%unknown: !stream.timepoint) -> !stream.timepoint {
   // CHECK-NOT: stream.timepoint.immediate
-  // CHECK: %[[CALL0:.+]] = call @nonUniformCallee(%[[UNKNOWN]])
-  %call0 = call @nonUniformCallee(%unknown) : (!stream.timepoint) -> !stream.timepoint
+  // CHECK: %[[CALL0:.+]] = util.call @nonUniformCallee(%[[UNKNOWN]])
+  %call0 = util.call @nonUniformCallee(%unknown) : (!stream.timepoint) -> !stream.timepoint
   // CHECK: %[[IMM:.+]] = stream.timepoint.immediate
   %imm = stream.timepoint.immediate => !stream.timepoint
-  // CHECK: %[[CALL1:.+]] = call @nonUniformCallee(%[[IMM]])
-  %call1 = call @nonUniformCallee(%imm) : (!stream.timepoint) -> !stream.timepoint
+  // CHECK: %[[CALL1:.+]] = util.call @nonUniformCallee(%[[IMM]])
+  %call1 = util.call @nonUniformCallee(%imm) : (!stream.timepoint) -> !stream.timepoint
   // CHECK: %[[CALLER_JOIN:.+]] = stream.timepoint.join max(%[[CALL0]], %[[CALL1]])
   %join = stream.timepoint.join max(%call0, %call1) => !stream.timepoint
-  // CHECK: return %[[CALLER_JOIN]]
-  return %join : !stream.timepoint
+  // CHECK: util.return %[[CALLER_JOIN]]
+  util.return %join : !stream.timepoint
 }
-// CHECK-LABEL: func private @nonUniformCallee
+// CHECK-LABEL: util.func private @nonUniformCallee
 // CHECK-SAME: (%[[CALLEE_ARG:.+]]: !stream.timepoint)
-func.func private @nonUniformCallee(%arg0: !stream.timepoint) -> !stream.timepoint {
+util.func private @nonUniformCallee(%arg0: !stream.timepoint) -> !stream.timepoint {
   // CHECK-NOT: stream.timepoint.immediate
   // CHECK: %[[CALLEE_JOIN:.+]] = stream.timepoint.join max(%[[CALLEE_ARG]])
   %0 = stream.timepoint.join max(%arg0) => !stream.timepoint
-  // CHECK: return %[[CALLEE_JOIN]]
-  return %0 : !stream.timepoint
+  // CHECK: util.return %[[CALLEE_JOIN]]
+  util.return %0 : !stream.timepoint
 }
 
 // -----
 
 // Tests that an immediate timepoint passed along a block edge is propagated.
 
-// CHECK-LABEL: func @branch
-func.func @branch() -> !stream.timepoint {
+// CHECK-LABEL: util.func public @branch
+util.func public @branch() -> !stream.timepoint {
   %t0 = stream.timepoint.immediate => !stream.timepoint
   // CHECK: cf.br ^bb1
   cf.br ^bb1(%t0 : !stream.timepoint)
@@ -251,8 +251,8 @@
   // CHECK-NEXT: %[[T1:.+]] = stream.timepoint.join max(%[[BB1_T0_IMMEDIATE]])
   %t1 = stream.timepoint.join max(%bb1_t0) => !stream.timepoint
   // CHECK-NEXT: %[[JOIN_IMMEDIATE:.+]] = stream.timepoint.immediate
-  // CHECK-NEXT: return %[[JOIN_IMMEDIATE]]
-  return %t1 : !stream.timepoint
+  // CHECK-NEXT: util.return %[[JOIN_IMMEDIATE]]
+  util.return %t1 : !stream.timepoint
 }
 
 // -----
@@ -260,8 +260,8 @@
 // Tests that forward edges with convergently immediate timepoints get
 // propagated.
 
-// CHECK-LABEL: func @branchConvergentForwardEdge
-func.func @branchConvergentForwardEdge(%cond: i1) -> !stream.timepoint {
+// CHECK-LABEL: util.func public @branchConvergentForwardEdge
+util.func public @branchConvergentForwardEdge(%cond: i1) -> !stream.timepoint {
   // CHECK: %[[IMM0:.+]] = stream.timepoint.immediate
   %imm0 = stream.timepoint.immediate => !stream.timepoint
   // CHECK: %[[IMM1:.+]] = stream.timepoint.immediate
@@ -273,17 +273,17 @@
 // CHECK-NEXT: ^bb1(%[[BB1_ARG:.+]]: !stream.timepoint)
 ^bb1(%bb1_arg: !stream.timepoint):
   // CHECK: %[[BB1_IMM:.+]] = stream.timepoint.immediate
-  // CHECK: return %[[BB1_IMM]]
-  return %bb1_arg : !stream.timepoint
+  // CHECK: util.return %[[BB1_IMM]]
+  util.return %bb1_arg : !stream.timepoint
 }
 
 // -----
 
 // Tests that forward edges with divergent timepoints don't get propagated.
 
-// CHECK-LABEL: func @branchDivergentForwardEdge
+// CHECK-LABEL: util.func public @branchDivergentForwardEdge
 // CHECK-SAME: (%[[COND:.+]]: i1, %[[UNKNOWN:.+]]: !stream.timepoint)
-func.func @branchDivergentForwardEdge(%cond: i1, %unknown: !stream.timepoint) -> !stream.timepoint {
+util.func public @branchDivergentForwardEdge(%cond: i1, %unknown: !stream.timepoint) -> !stream.timepoint {
   // CHECK: %[[IMM:.+]] = stream.timepoint.immediate
   %imm = stream.timepoint.immediate => !stream.timepoint
   // CHECK-NEXT: cf.cond_br %[[COND]]
@@ -292,16 +292,16 @@
   cf.cond_br %cond, ^bb1(%unknown : !stream.timepoint), ^bb1(%imm : !stream.timepoint)
 // CHECK-NEXT: ^bb1(%[[BB1_ARG:.+]]: !stream.timepoint)
 ^bb1(%bb1_arg: !stream.timepoint):
-  // CHECK: return %[[BB1_ARG]]
-  return %bb1_arg : !stream.timepoint
+  // CHECK: util.return %[[BB1_ARG]]
+  util.return %bb1_arg : !stream.timepoint
 }
 
 // -----
 
 // Tests that back edges with divergent timepoints don't get propagated.
 
-// CHECK-LABEL: func @branchDivergentBackEdge
-func.func @branchDivergentBackEdge(%cond: i1) -> !stream.timepoint {
+// CHECK-LABEL: util.func public @branchDivergentBackEdge
+util.func public @branchDivergentBackEdge(%cond: i1) -> !stream.timepoint {
   %t0 = stream.timepoint.immediate => !stream.timepoint
   // CHECK: cf.br ^bb1
   cf.br ^bb1(%cond, %t0 : i1, !stream.timepoint)
@@ -318,17 +318,17 @@
   cf.cond_br %bb1_cond, ^bb1(%cond_false, %bb1_t1 : i1, !stream.timepoint), ^bb2(%bb1_t1 : !stream.timepoint)
 // CHECK-NEXT: ^bb2(%[[BB2_T1:.+]]: !stream.timepoint)
 ^bb2(%bb2_t1: !stream.timepoint):
-  // CHECK-NEXT: return %[[BB2_T1]]
-  return %bb2_t1 : !stream.timepoint
+  // CHECK-NEXT: util.return %[[BB2_T1]]
+  util.return %bb2_t1 : !stream.timepoint
 }
 
 // -----
 
 // Tests that scf.if regions with convergent yields are handled.
 
-// CHECK-LABEL: func @scfIfConvergent
+// CHECK-LABEL: util.func public @scfIfConvergent
 // CHECK-SAME: (%[[COND:.+]]: i1)
-func.func @scfIfConvergent(%cond: i1) -> !stream.timepoint {
+util.func public @scfIfConvergent(%cond: i1) -> !stream.timepoint {
   // CHECK: %[[IF:.+]] = scf.if
   %if = scf.if %cond -> !stream.timepoint {
     // CHECK: %[[IMM0:.+]] = stream.timepoint.immediate
@@ -345,17 +345,17 @@
   // CHECK: %[[JOIN:.+]] = stream.timepoint.join max(%[[IF_IMM]])
   // CHECK-NEXT: %[[JOIN_IMM:.+]] = stream.timepoint.immediate
   %join = stream.timepoint.join max(%if) => !stream.timepoint
-  // CHECK: return %[[JOIN_IMM]]
-  return %join : !stream.timepoint
+  // CHECK: util.return %[[JOIN_IMM]]
+  util.return %join : !stream.timepoint
 }
 
 // -----
 
 // Tests that scf.if regions with divergent yields are handled.
 
-// CHECK-LABEL: func @scfIfDivergent
+// CHECK-LABEL: util.func public @scfIfDivergent
 // CHECK-SAME: (%[[COND:.+]]: i1, %[[UNKNOWN:.+]]: !stream.timepoint)
-func.func @scfIfDivergent(%cond: i1, %unknown: !stream.timepoint) -> !stream.timepoint {
+util.func public @scfIfDivergent(%cond: i1, %unknown: !stream.timepoint) -> !stream.timepoint {
   // CHECK: %[[IMM:.+]] = stream.timepoint.immediate
   %imm = stream.timepoint.immediate => !stream.timepoint
   // CHECK: %[[IF:.+]] = scf.if
@@ -371,8 +371,8 @@
   // CHECK-NOT: stream.timepoint.immediate
   // CHECK: %[[JOIN_OUTER:.+]] = stream.timepoint.join max(%[[UNKNOWN]], %[[IF]])
   %join_outer = stream.timepoint.join max(%unknown, %0) => !stream.timepoint
-  // CHECK: return %[[JOIN_OUTER]]
-  return %join_outer : !stream.timepoint
+  // CHECK: util.return %[[JOIN_OUTER]]
+  util.return %join_outer : !stream.timepoint
 }
 
 // TODO(benvanik): support scf.for

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/emplace_allocations.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/emplace_allocations.mlir
index 58bfef1..3a10e50 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/emplace_allocations.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/emplace_allocations.mlir

@@ -1,9 +1,9 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(func.func(iree-stream-emplace-allocations))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module( util.func(iree-stream-emplace-allocations))' %s | FileCheck %s
 
 // Tests that a dispatch result is placed into the target of an update.
 
 // CHECK-LABEL: @emplaceDispatch
-func.func @emplaceDispatch(
+util.func public @emplaceDispatch(
     // CHECK-SAME: %[[INPUT:arg[0-9]+]]: !stream.resource<*>, %[[INPUT_SIZE:arg[0-9]+]]: index,
     %input: !stream.resource<*>, %input_size: index,
     // CHECK-SAME: %[[UPDATE_OFFSET:arg[0-9]+]]: index, %[[UPDATE_SIZE:arg[0-9]+]]: index,
@@ -18,8 +18,8 @@
   %update = stream.async.dispatch @ex::@dispatch(%input[%c0 to %input_size for %input_size]) : (!stream.resource<*>{%input_size}) -> !stream.resource<*>{%update_size}
   // CHECK-NOT: stream.async.update
   %result = stream.async.update %update, %target[%update_offset to %update_end] : !stream.resource<*>{%update_size} -> %target as !stream.resource<*>{%target_size}
-  // CHECK: return %[[RESULT]]
-  return %result : !stream.resource<*>
+  // CHECK: util.return %[[RESULT]]
+  util.return %result : !stream.resource<*>
 }
 
 // -----
@@ -29,7 +29,7 @@
 // if the dispatch requires in-place operation that may not be safe.
 
 // CHECK-LABEL: @dontEmplaceTiedDispatch
-func.func @dontEmplaceTiedDispatch(
+util.func public @dontEmplaceTiedDispatch(
     %tied_input: !stream.resource<*>, %tied_input_size: index,
     %update_offset: index, %update_size: index,
     %target: !stream.resource<*>, %target_size: index) -> !stream.resource<*> {
@@ -39,8 +39,8 @@
   %update = stream.async.dispatch @ex::@dispatch(%tied_input[%c0 to %tied_input_size for %tied_input_size]) : (!stream.resource<*>{%tied_input_size}) -> %tied_input{%tied_input_size}
   // CHECK: %[[RESULT:.+]] = stream.async.update %[[TIED_RESULT]]
   %result = stream.async.update %update, %target[%update_offset to %update_end] : !stream.resource<*>{%tied_input_size} -> %target as !stream.resource<*>{%target_size}
-  // CHECK: return %[[RESULT]]
-  return %result : !stream.resource<*>
+  // CHECK: util.return %[[RESULT]]
+  util.return %result : !stream.resource<*>
 }
 
 // -----
@@ -50,7 +50,7 @@
 // test that explicitly as it's 95% of what this pass is designed to optimize.
 
 // CHECK-LABEL: @emplaceDispatchSequence
-func.func @emplaceDispatchSequence(
+util.func public @emplaceDispatchSequence(
     // CHECK-SAME: %[[INPUT:arg[0-9]+]]: !stream.resource<*>, %[[INPUT_SIZE:arg[0-9]+]]: index,
     %input: !stream.resource<*>, %input_size: index,
     // CHECK-SAME: %[[UPDATE_SIZE:arg[0-9]+]]: index, %[[TARGET_SIZE:arg[0-9]+]]: index
@@ -79,8 +79,8 @@
   %target2 = stream.async.update %update2, %target1[%c98304 to %c147456] : !stream.resource<*>{%update_size} -> %target1 as !stream.resource<*>{%target_size}
   // CHECK-NOT: stream.async.update
   %target3 = stream.async.update %update3, %target2[%c147456 to %c196608] : !stream.resource<*>{%update_size} -> %target2 as !stream.resource<*>{%target_size}
-  // CHECK: return %[[TARGET3]]
-  return %target3 : !stream.resource<*>
+  // CHECK: util.return %[[TARGET3]]
+  util.return %target3 : !stream.resource<*>
 }
 
 // -----
@@ -89,7 +89,7 @@
 // dependencies shouldn't stop us from emplacing.
 
 // CHECK-LABEL: @emplaceMultiResultDispatchSequence
-func.func @emplaceMultiResultDispatchSequence(
+util.func public @emplaceMultiResultDispatchSequence(
     // CHECK-SAME: %[[INPUT:arg[0-9]+]]: !stream.resource<*>, %[[INPUT_SIZE:arg[0-9]+]]: index,
     %input: !stream.resource<*>, %input_size: index,
     // CHECK-SAME: %[[UPDATE_SIZE:arg[0-9]+]]: index, %[[TARGET_SIZE:arg[0-9]+]]: index
@@ -118,8 +118,8 @@
   %target2 = stream.async.update %update2, %target1[%c98304 to %c147456] : !stream.resource<*>{%update_size} -> %target1 as !stream.resource<*>{%target_size}
   // CHECK-NOT: stream.async.update
   %target3 = stream.async.update %update3, %target2[%c147456 to %c196608] : !stream.resource<*>{%update_size} -> %target2 as !stream.resource<*>{%target_size}
-  // CHECK: return %[[TARGET3]]
-  return %target3 : !stream.resource<*>
+  // CHECK: util.return %[[TARGET3]]
+  util.return %target3 : !stream.resource<*>
 }
 
 // -----
@@ -129,7 +129,7 @@
 // into the same dispatch.
 
 // CHECK-LABEL: @emplaceMultiResultDispatchInto
-func.func @emplaceMultiResultDispatchInto(
+util.func public @emplaceMultiResultDispatchInto(
     // CHECK-SAME: %[[INPUT:arg[0-9]+]]: !stream.resource<*>, %[[INPUT_SIZE:arg[0-9]+]]: index,
     %input: !stream.resource<*>, %input_size: index,
     // CHECK-SAME: %[[UPDATE_SIZE:arg[0-9]+]]: index, %[[TARGET_SIZE:arg[0-9]+]]: index
@@ -149,8 +149,8 @@
   %target0 = stream.async.update %update#0, %target[%c0 to %c32] : !stream.resource<*>{%update_size} -> %target as !stream.resource<*>{%target_size}
   // CHECK-NOT: stream.async.update
   %target1 = stream.async.update %update#1, %target0[%c32 to %c64] : !stream.resource<*>{%update_size} -> %target0 as !stream.resource<*>{%target_size}
-  // CHECK: return %[[DISPATCH]]#1
-  return %target1 : !stream.resource<*>
+  // CHECK: util.return %[[DISPATCH]]#1
+  util.return %target1 : !stream.resource<*>
 }
 
 // -----
@@ -163,7 +163,7 @@
 // third as the second isn't placed.
 
 // CHECK-LABEL: @dontEmplaceSparseMultiResult
-func.func @dontEmplaceSparseMultiResult(
+util.func public @dontEmplaceSparseMultiResult(
     // CHECK-SAME: %[[INPUT:arg[0-9]+]]: !stream.resource<*>, %[[INPUT_SIZE:arg[0-9]+]]: index,
     %input: !stream.resource<*>, %input_size: index,
     // CHECK-SAME: %[[UPDATE_SIZE:arg[0-9]+]]: index, %[[TARGET_SIZE:arg[0-9]+]]: index
@@ -183,8 +183,8 @@
   %target0 = stream.async.update %update#0, %target[%c0 to %c32] : !stream.resource<*>{%update_size} -> %target as !stream.resource<*>{%target_size}
   // CHECK: %[[TARGET1:.+]] = stream.async.update %[[DISPATCH]]#2, %[[DISPATCH]]#0[%c32 to %c64]
   %target1 = stream.async.update %update#2, %target0[%c32 to %c64] : !stream.resource<*>{%update_size} -> %target0 as !stream.resource<*>{%target_size}
-  // CHECK: return %[[TARGET1]]
-  return %target1 : !stream.resource<*>
+  // CHECK: util.return %[[TARGET1]]
+  util.return %target1 : !stream.resource<*>
 }
 
 // -----
@@ -192,7 +192,7 @@
 // Tests that sequences with data dependencies don't hoist beyond them.
 
 // CHECK-LABEL: @emplaceDependentDispatchSequence
-func.func @emplaceDependentDispatchSequence(
+util.func public @emplaceDependentDispatchSequence(
     // CHECK-SAME: %[[INPUT:arg[0-9]+]]: !stream.resource<*>, %[[INPUT_SIZE:arg[0-9]+]]: index,
     %input: !stream.resource<*>, %input_size: index,
     // CHECK-SAME: %[[UPDATE_SIZE:arg[0-9]+]]: index, %[[TARGET_SIZE:arg[0-9]+]]: index
@@ -213,6 +213,6 @@
   %target0 = stream.async.update %update0, %target[%c0 to %c49152] : !stream.resource<*>{%update_size} -> %target as !stream.resource<*>{%target_size}
   // CHECK-NEXT: %[[TARGET1:.+]] = stream.async.update %[[UPDATE1]], %[[TARGET0]]
   %target1 = stream.async.update %update1, %target0[%c49152 to %c98304] : !stream.resource<*>{%update_size} -> %target0 as !stream.resource<*>{%target_size}
-  // CHECK-NEXT: return %[[TARGET1]]
-  return %target1 : !stream.resource<*>
+  // CHECK-NEXT: util.return %[[TARGET1]]
+  util.return %target1 : !stream.resource<*>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_device_tensors.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_device_tensors.mlir
index a9e57f0..06c0700 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_device_tensors.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_device_tensors.mlir

@@ -4,7 +4,7 @@
 stream.executable private @convert_load_i1 {
   stream.executable.export public @dispatch
   builtin.module {
-    func.func @dispatch(%arg0: !stream.binding) {
+     util.func public @dispatch(%arg0: !stream.binding) {
       %c0 = arith.constant 0 : index
       // CHECK: %[[BINDING:.+]] = stream.binding.subspan {{.+}} -> !flow.dispatch.tensor<readonly:tensor<4xi8>>
       %binding = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4xi1>>
@@ -13,7 +13,7 @@
       %tile = flow.dispatch.tensor.load %binding, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4xi1>> -> tensor<?xi1>
       // CHECK: util.optimization_barrier %[[TILE_I1]]
       util.optimization_barrier %tile : tensor<?xi1>
-      return
+      util.return
     }
   }
 }
@@ -24,7 +24,7 @@
 stream.executable private @convert_store_i1 {
   stream.executable.export public @dispatch
   builtin.module {
-    func.func @dispatch(%arg0: !stream.binding) {
+     util.func public @dispatch(%arg0: !stream.binding) {
       %c0 = arith.constant 0 : index
       // CHECK-DAG: %[[TILE_I8:.+]] = arith.constant dense<[0, 0, 1, 1]> : tensor<4xi8>
       // CHECK-DAG: %[[BINDING:.+]] = stream.binding.subspan {{.+}} -> !flow.dispatch.tensor<writeonly:tensor<4xi8>>
@@ -32,7 +32,7 @@
       %cst = arith.constant dense<[false, false, true, true]> : tensor<4xi1>
       // CHECK-NEXT: flow.dispatch.tensor.store %[[TILE_I8]], %[[BINDING]], {{.+}} : tensor<4xi8> -> !flow.dispatch.tensor<writeonly:tensor<4xi8>>
       flow.dispatch.tensor.store %cst, %binding, offsets = [0], sizes = [4], strides = [1] : tensor<4xi1> -> !flow.dispatch.tensor<writeonly:tensor<4xi1>>
-      return
+      util.return
     }
   }
 }
@@ -43,7 +43,7 @@
 stream.executable private @convert_multi_i1 {
   stream.executable.export public @dispatch
   builtin.module {
-    func.func @dispatch(%arg0: !stream.binding, %arg1: !stream.binding) {
+     util.func public @dispatch(%arg0: !stream.binding, %arg1: !stream.binding) {
       %c0 = arith.constant 0 : index
       %c4 = arith.constant 4 : index
       // CHECK-DAG: %[[BINDING0:.+]] = stream.binding.subspan %arg0{{.+}} -> !flow.dispatch.tensor<readonly:tensor<4xi8>>
@@ -61,7 +61,7 @@
       // CHECK: %[[RESULT_I8:.+]] = arith.extui %[[RESULT_I1]] : tensor<?xi1> to tensor<?xi8>
       // CHECK-NEXT: flow.dispatch.tensor.store %[[RESULT_I8]], %[[BINDING1]], {{.+}} : tensor<?xi8> -> !flow.dispatch.tensor<readwrite:tensor<4xi8>>
       flow.dispatch.tensor.store %result, %binding1, offsets = [0], sizes = [%c4], strides = [1] : tensor<?xi1> -> !flow.dispatch.tensor<readwrite:tensor<4xi1>>
-      return
+      util.return
     }
   }
 }
@@ -72,7 +72,7 @@
 stream.executable private @convert_load_i33 {
   stream.executable.export public @dispatch
   builtin.module {
-    func.func @dispatch(%arg0: !stream.binding) {
+     util.func public @dispatch(%arg0: !stream.binding) {
       %c0 = arith.constant 0 : index
       // CHECK: %[[BINDING:.+]] = stream.binding.subspan {{.+}} -> !flow.dispatch.tensor<readonly:tensor<4xi64>>
       %binding = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4xi33>>
@@ -81,7 +81,7 @@
       %tile = flow.dispatch.tensor.load %binding, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4xi33>> -> tensor<?xi33>
       // CHECK: util.optimization_barrier %[[TILE_I1]]
       util.optimization_barrier %tile : tensor<?xi33>
-      return
+      util.return
     }
   }
 }
@@ -93,7 +93,7 @@
 stream.executable private @convert_store_i33 {
   stream.executable.export public @dispatch
   builtin.module {
-    func.func @dispatch(%arg0: !stream.binding) {
+     util.func public @dispatch(%arg0: !stream.binding) {
       // CHECK: %[[CST:.+]] = arith.constant dense<[0, 7, 2, 5]> : tensor<4xi64>
       %c0 = arith.constant 0 : index
       // CHECK: %[[BINDING:.+]] = stream.binding.subspan {{.+}} -> !flow.dispatch.tensor<writeonly:tensor<4xi64>>
@@ -101,7 +101,7 @@
       %cst = arith.constant dense<[0, 7, 2, 5]> : tensor<4xi33>
       // CHECK: flow.dispatch.tensor.store %[[CST]], %[[BINDING]], {{.+}} : tensor<4xi64> -> !flow.dispatch.tensor<writeonly:tensor<4xi64>>
       flow.dispatch.tensor.store %cst, %binding, offsets = [0], sizes = [4], strides = [1] : tensor<4xi33> -> !flow.dispatch.tensor<writeonly:tensor<4xi33>>
-      return
+      util.return
     }
   }
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_device_tensors_packing.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_device_tensors_packing.mlir
index d5a2856..886abca 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_device_tensors_packing.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_device_tensors_packing.mlir

@@ -7,7 +7,7 @@
 stream.executable private @subspanLoadI3 {
   stream.executable.export public @dispatch
   builtin.module {
-    func.func @dispatch(%arg0: !stream.binding) {
+     util.func public @dispatch(%arg0: !stream.binding) {
       %c0 = arith.constant 0 : index
       // CHECK: %[[BINDING:.+]] = stream.binding.subspan {{.+}} -> !flow.dispatch.tensor<readonly:tensor<4xi8>>
       %binding = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4xi3>>
@@ -16,7 +16,7 @@
       %tile = flow.dispatch.tensor.load %binding, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4xi3>> -> tensor<?xi3>
       // CHECK: util.optimization_barrier %[[TILE_I3]] : tensor<?xi3>
       util.optimization_barrier %tile : tensor<?xi3>
-      return
+      util.return
     }
   }
 }
@@ -30,7 +30,7 @@
 stream.executable private @subspanStoreI3 {
   stream.executable.export public @dispatch
   builtin.module {
-    func.func @dispatch(%arg0: !stream.binding) {
+     util.func public @dispatch(%arg0: !stream.binding) {
       // CHECK: %[[CST:.+]] = arith.constant dense<[0, 7, 2, 5]> : tensor<4xi8>
       %c0 = arith.constant 0 : index
       // CHECK: %[[BINDING:.+]] = stream.binding.subspan {{.+}} -> !flow.dispatch.tensor<writeonly:tensor<4xi8>>
@@ -38,7 +38,7 @@
       %cst = arith.constant dense<[0, 7, 2, 5]> : tensor<4xi3>
       // CHECK: flow.dispatch.tensor.store %[[CST]], %[[BINDING]], {{.+}} : tensor<4xi8> -> !flow.dispatch.tensor<writeonly:tensor<4xi8>>
       flow.dispatch.tensor.store %cst, %binding, offsets = [0], sizes = [4], strides = [1] : tensor<4xi3> -> !flow.dispatch.tensor<writeonly:tensor<4xi3>>
-      return
+      util.return
     }
   }
 }
@@ -49,7 +49,7 @@
 stream.executable private @subspanLoadI4 {
   stream.executable.export public @dispatch
   builtin.module {
-    func.func @dispatch(%arg0: !stream.binding) {
+     util.func public @dispatch(%arg0: !stream.binding) {
       %c0 = arith.constant 0 : index
       // CHECK: %[[BINDING:.+]] = stream.binding.subspan {{.+}} -> !flow.dispatch.tensor<readonly:tensor<8xi4>>
       %binding = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<8xi4>>
@@ -57,7 +57,7 @@
       %tile = flow.dispatch.tensor.load %binding, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:tensor<8xi4>> -> tensor<?xi4>
       // CHECK: util.optimization_barrier %[[TILE_I4]]
       util.optimization_barrier %tile : tensor<?xi4>
-      return
+      util.return
     }
   }
 }
@@ -68,7 +68,7 @@
 stream.executable private @subspanStoreI4 {
   stream.executable.export public @dispatch
   builtin.module {
-    func.func @dispatch(%arg0: !stream.binding) {
+     util.func public @dispatch(%arg0: !stream.binding) {
       %c0 = arith.constant 0 : index
       // CHECK: %[[TILE_I4:.+]] = arith.constant dense<[5, -1, 0, 3, 1, 7, -8, 4]> : tensor<8xi4>
       %cst = arith.constant dense<[5, 15, 0, 3, 1, 7, 8, 4]> : tensor<8xi4>
@@ -76,7 +76,7 @@
       %binding = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<8xi4>>
       // CHECK: flow.dispatch.tensor.store %[[TILE_I4]], %[[BINDING]], offsets = [0], sizes = [8], strides = [1] : tensor<8xi4> -> !flow.dispatch.tensor<writeonly:tensor<8xi4>>
       flow.dispatch.tensor.store %cst, %binding, offsets = [0], sizes = [8], strides = [1] : tensor<8xi4> -> !flow.dispatch.tensor<writeonly:tensor<8xi4>>
-      return
+      util.return
     }
   }
 }
@@ -87,7 +87,7 @@
 stream.executable private @subspanLoadI8 {
   stream.executable.export public @dispatch
   builtin.module {
-    func.func @dispatch(%arg0: !stream.binding) {
+     util.func public @dispatch(%arg0: !stream.binding) {
       %c0 = arith.constant 0 : index
       // CHECK: %[[BINDING:.+]] = stream.binding.subspan {{.+}} -> !flow.dispatch.tensor<readonly:tensor<4xi8>>
       %binding = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<4xi8>>
@@ -95,7 +95,7 @@
       %tile = flow.dispatch.tensor.load %binding, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4xi8>> -> tensor<?xi8>
       // CHECK: util.optimization_barrier %[[TILE_I8]]
       util.optimization_barrier %tile : tensor<?xi8>
-      return
+      util.return
     }
   }
 }
@@ -106,7 +106,7 @@
 stream.executable private @subspanStoreI8 {
   stream.executable.export public @dispatch
   builtin.module {
-    func.func @dispatch(%arg0: !stream.binding) {
+     util.func public @dispatch(%arg0: !stream.binding) {
       %c0 = arith.constant 0 : index
       // CHECK-DAG: %[[TILE_I8:.+]] = arith.constant dense<[25, 8, 0, -1]> : tensor<4xi8>
       // CHECK-DAG: %[[BINDING:.+]] = stream.binding.subspan {{.+}} -> !flow.dispatch.tensor<writeonly:tensor<4xi8>>
@@ -114,7 +114,7 @@
       %cst = arith.constant dense<[25, 8, 0, 255]> : tensor<4xi8>
       // CHECK-NEXT: flow.dispatch.tensor.store %[[TILE_I8]], %[[BINDING]], {{.+}} : tensor<4xi8> -> !flow.dispatch.tensor<writeonly:tensor<4xi8>>
       flow.dispatch.tensor.store %cst, %binding, offsets = [0], sizes = [4], strides = [1] : tensor<4xi8> -> !flow.dispatch.tensor<writeonly:tensor<4xi8>>
-      return
+      util.return
     }
   }
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors.mlir
index b34db57..cb1aca7 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors.mlir

@@ -1,44 +1,44 @@
 // RUN: iree-opt --split-input-file --iree-stream-encode-host-tensors %s | FileCheck %s
 
 // CHECK-LABEL: @denseTensorSizeOf
-func.func @denseTensorSizeOf(%arg0: index) -> index {
+util.func public @denseTensorSizeOf(%arg0: index) -> index {
   // CHECK: %[[STATIC_SIZE:.+]] = arith.constant 20 : index
   // CHECK: %[[DYNAMIC_SIZE:.+]] = arith.muli %arg0, %[[STATIC_SIZE]] : index
   %0 = stream.tensor.sizeof tensor<?x5xf32>{%arg0} : index
-  // CHECK: return %[[DYNAMIC_SIZE]]
-  return %0 : index
+  // CHECK: util.return %[[DYNAMIC_SIZE]]
+  util.return %0 : index
 }
 
 // -----
 
 // CHECK-LABEL: @denseTensorSizeOfEmpty
-func.func @denseTensorSizeOfEmpty(%arg0: index) -> index {
+util.func public @denseTensorSizeOfEmpty(%arg0: index) -> index {
   // CHECK: %[[ZERO:.+]] = arith.constant 0 : index
   %0 = stream.tensor.sizeof tensor<?x0xf32>{%arg0} : index
-  // CHECK: return %[[ZERO]]
-  return %0 : index
+  // CHECK: util.return %[[ZERO]]
+  util.return %0 : index
 }
 
 // -----
 
 // CHECK-LABEL: @denseTensorEmpty
-func.func @denseTensorEmpty(%arg0: index, %arg1: index) -> !stream.resource<*> {
+util.func public @denseTensorEmpty(%arg0: index, %arg1: index) -> !stream.resource<*> {
   // CHECK: %[[RET:.+]] = stream.async.alloca : !stream.resource<*>{%arg1}
   %0 = stream.tensor.empty : tensor<?x1xf32>{%arg0} in !stream.resource<*>{%arg1}
-  // CHECK: return %[[RET]]
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %[[RET]]
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @denseTensorConstant
-func.func @denseTensorConstant(%arg0: index) -> !stream.resource<constant> {
+util.func public @denseTensorConstant(%arg0: index) -> !stream.resource<constant> {
   // CHECK: %[[STATIC_SIZE:.+]] = arith.constant 1280 : index
   // CHECK: %[[DYNAMIC_SIZE:.+]] = arith.muli %arg0, %[[STATIC_SIZE]] : index
   // CHECK: %[[RET:.+]] = stream.async.constant : !stream.resource<constant>{%[[DYNAMIC_SIZE]]} = dense<0.000000e+00> : tensor<1x5x64xf32>
   %0 = stream.tensor.constant : tensor<?x5x64xf32>{%arg0} in !stream.resource<constant> = dense<0.000000e+00> : tensor<1x5x64xf32>
-  // CHECK: return %[[RET]]
-  return %0 : !stream.resource<constant>
+  // CHECK: util.return %[[RET]]
+  util.return %0 : !stream.resource<constant>
 }
 
 // -----
@@ -46,91 +46,91 @@
 // Tests that sub-byte element width constants get extended to byte alignment.
 
 // CHECK-LABEL: @denseTensorConstantI1
-func.func @denseTensorConstantI1() -> !stream.resource<constant> {
+util.func public @denseTensorConstantI1() -> !stream.resource<constant> {
   // CHECK: %[[STATIC_SIZE:.+]] = arith.constant 4 : index
   // CHECK: %[[RET:.+]] = stream.async.constant : !stream.resource<constant>{%[[STATIC_SIZE]]} = dense<[1, 1, 0, 1]> : tensor<4xi8>
   %0 = stream.tensor.constant : tensor<4xi1> in !stream.resource<constant> = dense<[true, true, false, true]> : tensor<4xi1>
-  // CHECK: return %[[RET]]
-  return %0 : !stream.resource<constant>
+  // CHECK: util.return %[[RET]]
+  util.return %0 : !stream.resource<constant>
 }
 
 // -----
 
 // CHECK-LABEL: @denseTensorSplatI32
-func.func @denseTensorSplatI32(%arg0: i32, %arg1: index, %arg2: index) -> !stream.resource<*> {
+util.func public @denseTensorSplatI32(%arg0: i32, %arg1: index, %arg2: index) -> !stream.resource<*> {
   // CHECK: %[[RET:.+]] = stream.async.splat %arg0 : i32 -> !stream.resource<*>{%arg2}
   %0 = stream.tensor.splat %arg0 : i32 -> tensor<?x1x10xi32>{%arg1} in !stream.resource<*>{%arg2}
-  // CHECK: return %[[RET]]
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %[[RET]]
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @denseTensorSplatI1
-func.func @denseTensorSplatI1(%arg0: i1, %arg1: index, %arg2: index) -> !stream.resource<*> {
+util.func public @denseTensorSplatI1(%arg0: i1, %arg1: index, %arg2: index) -> !stream.resource<*> {
   // CHECK: %[[PATTERN:.+]] = arith.extui %arg0 : i1 to i8
   // CHECK: %[[RET:.+]] = stream.async.splat %[[PATTERN]] : i8 -> !stream.resource<*>{%arg2}
   %0 = stream.tensor.splat %arg0 : i1 -> tensor<?x1x10xi1>{%arg1} in !stream.resource<*>{%arg2}
-  // CHECK: return %[[RET]]
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %[[RET]]
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @denseTensorSplatBF16
-func.func @denseTensorSplatBF16(%arg0: bf16, %arg1: index, %arg2: index) -> !stream.resource<*> {
+util.func public @denseTensorSplatBF16(%arg0: bf16, %arg1: index, %arg2: index) -> !stream.resource<*> {
   // CHECK: %[[PATTERN:.+]] = arith.bitcast %arg0 : bf16 to i16
   // CHECK: %[[RET:.+]] = stream.async.splat %[[PATTERN]] : i16 -> !stream.resource<*>{%arg2}
   %0 = stream.tensor.splat %arg0 : bf16 -> tensor<?x1x10xbf16>{%arg1} in !stream.resource<*>{%arg2}
-  // CHECK: return %[[RET]]
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %[[RET]]
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @denseTensorSplatF32
-func.func @denseTensorSplatF32(%arg0: f32, %arg1: index, %arg2: index) -> !stream.resource<*> {
+util.func public @denseTensorSplatF32(%arg0: f32, %arg1: index, %arg2: index) -> !stream.resource<*> {
   // CHECK: %[[PATTERN:.+]] = arith.bitcast %arg0 : f32 to i32
   // CHECK: %[[RET:.+]] = stream.async.splat %[[PATTERN]] : i32 -> !stream.resource<*>{%arg2}
   %0 = stream.tensor.splat %arg0 : f32 -> tensor<?x1x10xf32>{%arg1} in !stream.resource<*>{%arg2}
-  // CHECK: return %[[RET]]
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %[[RET]]
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @denseTensorSplatI64
-func.func @denseTensorSplatI64(%arg0: i64, %arg1: index, %arg2: index) -> !stream.resource<*> {
+util.func public @denseTensorSplatI64(%arg0: i64, %arg1: index, %arg2: index) -> !stream.resource<*> {
   // CHECK: %[[RET:.+]] = stream.async.splat %arg0 : i64 -> !stream.resource<*>{%arg2}
   %0 = stream.tensor.splat %arg0 : i64 -> tensor<?x1x10xi64>{%arg1} in !stream.resource<*>{%arg2}
-  // CHECK: return %[[RET]]
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %[[RET]]
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @denseTensorSplatConstantComplexF32
-func.func @denseTensorSplatConstantComplexF32(%arg0: !stream.resource<*>) -> (!stream.resource<*>) {
+util.func public @denseTensorSplatConstantComplexF32(%arg0: !stream.resource<*>) -> (!stream.resource<*>) {
   %cst = complex.constant [3.000000e+00 : f32, 1.000000e+01 : f32] : complex<f32>
   %0 = stream.tensor.sizeof tensor<6xcomplex<f32>> : index
   // CHECK: %[[I64NUMBER:.+]] = complex.constant [3.000000e+00 : f32, 1.000000e+01 : f32] : complex<f32>
   // CHECK: %[[BITCAST:.+]] = complex.bitcast %[[I64NUMBER]] : complex<f32> to i64
   // CHECK: %[[SPLAT_RES:.+]] = stream.async.splat %[[BITCAST]]
   %1 = stream.tensor.splat %cst : complex<f32> -> tensor<6xcomplex<f32>> in !stream.resource<*>{%0}
-  // CHECK: return %[[SPLAT_RES]]
-  return %1 : !stream.resource<*>
+  // CHECK: util.return %[[SPLAT_RES]]
+  util.return %1 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @denseTensorSplatDynamicComplexF32
-func.func @denseTensorSplatDynamicComplexF32(%arg0: !stream.resource<*>, %arg1: complex<f32>) -> (!stream.resource<*>) {
+util.func public @denseTensorSplatDynamicComplexF32(%arg0: !stream.resource<*>, %arg1: complex<f32>) -> (!stream.resource<*>) {
   %0 = stream.tensor.sizeof tensor<6xcomplex<f32>> : index
   // CHECK: %[[BITCAST:.+]] = complex.bitcast %arg1 : complex<f32> to i64
   // CHECK: %[[SPLAT_RES:.+]] = stream.async.splat %[[BITCAST]]
   %1 = stream.tensor.splat %arg1 : complex<f32> -> tensor<6xcomplex<f32>> in !stream.resource<*>{%0}
-  // CHECK: return %[[SPLAT_RES]]
-  return %1 : !stream.resource<*>
+  // CHECK: util.return %[[SPLAT_RES]]
+  util.return %1 : !stream.resource<*>
 }
 
 // -----
@@ -138,33 +138,33 @@
 // NOTE: clone likes to fold; the fills ensure it doesn't.
 
 // CHECK-LABEL: @denseTensorClone
-func.func @denseTensorClone(%arg0: !stream.resource<*>, %arg1: index, %arg2: index, %arg3: f32) -> (!stream.resource<*>, !stream.resource<*>) {
+util.func public @denseTensorClone(%arg0: !stream.resource<*>, %arg1: index, %arg2: index, %arg3: f32) -> (!stream.resource<*>, !stream.resource<*>) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   // CHECK: %[[RET:.+]] = stream.async.clone %arg0 : !stream.resource<*>{%arg2} -> !stream.resource<*>{%arg2}
   %0 = stream.tensor.clone %arg0 : tensor<?x4xf32>{%arg1} in !stream.resource<*>{%arg2} -> tensor<?x4xf32>{%arg1} in !stream.resource<*>{%arg2}
   %1 = stream.tensor.fill %arg3, %0[%c0, %c0 for %c1, %c1] : f32 -> tensor<?x4xf32>{%arg1} in %0 as !stream.resource<*>{%arg2}
-  return %0, %1 : !stream.resource<*>, !stream.resource<*>
+  util.return %0, %1 : !stream.resource<*>, !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @denseTensorSlice
-func.func @denseTensorSlice(%arg0: !stream.resource<*>, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> !stream.resource<*> {
+util.func public @denseTensorSlice(%arg0: !stream.resource<*>, %arg1: index, %arg2: index, %arg3: index, %arg4: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   // CHECK: %[[OFFSET:.+]] = arith.constant 4 : index
   // CHECK: %[[END:.+]] = arith.addi %arg4, %[[OFFSET]] : index
   // CHECK: %[[RET:.+]] = stream.async.slice %arg0[%[[OFFSET]] to %[[END]]] : !stream.resource<*>{%arg2} -> !stream.resource<*>{%arg4}
   %0 = stream.tensor.slice %arg0[%c0, %c1 for %arg3, %c1] : tensor<?x4xf32>{%arg1} in !stream.resource<*>{%arg2} -> tensor<?x1xf32>{%arg3} in !stream.resource<*>{%arg4}
-  // CHECK: return %[[RET]]
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %[[RET]]
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @denseTensorFillF32
-func.func @denseTensorFillF32(%arg0: f32, %arg1: !stream.resource<*>, %arg2: index, %arg3: index) -> !stream.resource<*> {
+util.func public @denseTensorFillF32(%arg0: f32, %arg1: !stream.resource<*>, %arg2: index, %arg3: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   // CHECK-DAG: %[[OFFSET:.+]] = arith.constant 0 : index
@@ -172,28 +172,28 @@
   // CHECK-DAG: %[[PATTERN:.+]] = arith.bitcast %arg0 : f32 to i32
   // CHECK: %[[RET:.+]] = stream.async.fill %[[PATTERN]], %arg1[%[[OFFSET]] to %[[LENGTH]] for %[[LENGTH]]] : i32 -> %arg1 as !stream.resource<*>{%arg3}
   %0 = stream.tensor.fill %arg0, %arg1[%c0, %c0 for %c1, %c1] : f32 -> tensor<?x4xf32>{%arg2} in %arg1 as !stream.resource<*>{%arg3}
-  // CHECK: return %[[RET]]
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %[[RET]]
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @denseTensorFillI64
-func.func @denseTensorFillI64(%arg0: i64, %arg1: !stream.resource<*>, %arg2: index, %arg3: index) -> !stream.resource<*> {
+util.func public @denseTensorFillI64(%arg0: i64, %arg1: !stream.resource<*>, %arg2: index, %arg3: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   // CHECK-DAG: %[[OFFSET:.+]] = arith.constant 0 : index
   // CHECK-DAG: %[[LENGTH:.+]] = arith.constant 40 : index
   // CHECK: %[[RET:.+]] = stream.async.fill %arg0, %arg1[%[[OFFSET]] to %[[LENGTH]] for %[[LENGTH]]] : i64 -> %arg1 as !stream.resource<*>{%arg3}
   %0 = stream.tensor.fill %arg0, %arg1[%c0, %c0 for %c1, %c1] : i64 -> tensor<?x4xi64>{%arg2} in %arg1 as !stream.resource<*>{%arg3}
-  // CHECK: return %[[RET]]
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %[[RET]]
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @denseTensorFillF64
-func.func @denseTensorFillF64(%arg0: f64, %arg1: !stream.resource<*>, %arg2: index, %arg3: index) -> !stream.resource<*> {
+util.func public @denseTensorFillF64(%arg0: f64, %arg1: !stream.resource<*>, %arg2: index, %arg3: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   // CHECK-DAG: %[[OFFSET:.+]] = arith.constant 0 : index
@@ -201,67 +201,67 @@
   // CHECK-DAG: %[[PATTERN:.+]] = arith.bitcast %arg0 : f64 to i64
   // CHECK: %[[RET:.+]] = stream.async.fill %[[PATTERN]], %arg1[%[[OFFSET]] to %[[LENGTH]] for %[[LENGTH]]] : i64 -> %arg1 as !stream.resource<*>{%arg3}
   %0 = stream.tensor.fill %arg0, %arg1[%c0, %c0 for %c1, %c1] : f64 -> tensor<?x4xi64>{%arg2} in %arg1 as !stream.resource<*>{%arg3}
-  // CHECK: return %[[RET]]
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %[[RET]]
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @denseTensorUpdate
-func.func @denseTensorUpdate(%arg0: !stream.resource<*>, %arg1: index, %arg2: !stream.resource<*>, %arg3: index, %arg4: index) -> !stream.resource<*> {
+util.func public @denseTensorUpdate(%arg0: !stream.resource<*>, %arg1: index, %arg2: !stream.resource<*>, %arg3: index, %arg4: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   // CHECK: %[[OFFSET:.+]] = arith.constant 0 : index
   // CHECK: %[[RET:.+]] = stream.async.update %arg0, %arg2[%[[OFFSET]] to %arg1] : !stream.resource<*>{%arg1} -> %arg2 as !stream.resource<*>{%arg4}
   %0 = stream.tensor.update %arg0, %arg2[%c0, %c0] : tensor<2x2xf32> in !stream.resource<*>{%arg1} -> tensor<?x4xf32>{%arg3} in %arg2 as !stream.resource<*>{%arg4}
-  // CHECK: return %[[RET]]
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %[[RET]]
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @denseTensorLoad
-func.func @denseTensorLoad(%arg0: !stream.resource<staging>, %arg1: index, %arg2: index) -> f32 {
+util.func public @denseTensorLoad(%arg0: !stream.resource<staging>, %arg1: index, %arg2: index) -> f32 {
   %c0 = arith.constant 0 : index
   // CHECK: %[[OFFSET:.+]] = arith.constant 0 : index
   // CHECK: %[[RET:.+]] = stream.async.load %arg0[%[[OFFSET]]] : !stream.resource<staging>{%arg2} -> f32
   %0 = stream.tensor.load %arg0[%c0] : tensor<?xf32>{%arg1} in !stream.resource<staging>{%arg2} -> f32
-  // CHECK: return %[[RET]]
-  return %0 : f32
+  // CHECK: util.return %[[RET]]
+  util.return %0 : f32
 }
 
 // -----
 
 // CHECK-LABEL: @denseTensorLoadRank0
-func.func @denseTensorLoadRank0(%arg0: !stream.resource<staging>, %arg1: index) -> f32 {
+util.func public @denseTensorLoadRank0(%arg0: !stream.resource<staging>, %arg1: index) -> f32 {
   %c0 = arith.constant 0 : index
   // CHECK: %[[OFFSET:.+]] = arith.constant 0 : index
   // CHECK: %[[RET:.+]] = stream.async.load %arg0[%[[OFFSET]]] : !stream.resource<staging>{%arg1} -> f32
   %0 = stream.tensor.load %arg0 : tensor<f32> in !stream.resource<staging>{%arg1} -> f32
-  // CHECK: return %[[RET]]
-  return %0 : f32
+  // CHECK: util.return %[[RET]]
+  util.return %0 : f32
 }
 
 // -----
 
 // CHECK-LABEL: @denseTensorStore
-func.func @denseTensorStore(%arg0: !stream.resource<staging>, %arg1: index, %arg2: index, %arg3: f32) -> !stream.resource<staging> {
+util.func public @denseTensorStore(%arg0: !stream.resource<staging>, %arg1: index, %arg2: index, %arg3: f32) -> !stream.resource<staging> {
   %c0 = arith.constant 0 : index
   // CHECK: %[[OFFSET:.+]] = arith.constant 0 : index
   // CHECK: %[[RET:.+]] = stream.async.store %arg3, %arg0[%[[OFFSET]]] : f32 -> %arg0 as !stream.resource<staging>{%arg2}
   %0 = stream.tensor.store %arg3, %arg0[%c0] : f32 -> tensor<?xf32>{%arg1} in %arg0 as !stream.resource<staging>{%arg2}
-  // CHECK: return %[[RET]]
-  return %0 : !stream.resource<staging>
+  // CHECK: util.return %[[RET]]
+  util.return %0 : !stream.resource<staging>
 }
 
 // -----
 
 // CHECK-LABEL: @denseTensorStoreRank0
-func.func @denseTensorStoreRank0(%arg0: !stream.resource<staging>, %arg1: index, %arg2: f32) -> !stream.resource<staging> {
+util.func public @denseTensorStoreRank0(%arg0: !stream.resource<staging>, %arg1: index, %arg2: f32) -> !stream.resource<staging> {
   %c0 = arith.constant 0 : index
   // CHECK: %[[OFFSET:.+]] = arith.constant 0 : index
   // CHECK: %[[RET:.+]] = stream.async.store %arg2, %arg0[%[[OFFSET]]] : f32 -> %arg0 as !stream.resource<staging>{%arg1}
   %0 = stream.tensor.store %arg2, %arg0 : f32 -> tensor<f32> in %arg0 as !stream.resource<staging>{%arg1}
-  // CHECK: return %[[RET]]
-  return %0 : !stream.resource<staging>
+  // CHECK: util.return %[[RET]]
+  util.return %0 : !stream.resource<staging>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_packing.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_packing.mlir
index 824d0a7..bc58632 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_packing.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_packing.mlir

@@ -1,15 +1,15 @@
 // RUN: iree-opt --split-input-file --iree-stream-encode-host-tensors --verify-diagnostics %s | FileCheck %s
 
-// CHECK-LABEL: func.func @denseTensorConstantI2()
-func.func @denseTensorConstantI2() -> !stream.resource<constant> {
+// CHECK-LABEL:  util.func public @denseTensorConstantI2()
+util.func public @denseTensorConstantI2() -> !stream.resource<constant> {
   // CHECK: %[[STATIC_SIZE:.+]] = arith.constant 4 : index
   // CHECK: %[[RET:.+]] = stream.async.constant : !stream.resource<constant>{%[[STATIC_SIZE]]} =
   // CHECK-SAME: dense<[0, 1, -2, -1, 0, 1, -2, -1, 0, 1, -2, -1, 0, 1, -2, -1]> : tensor<16xi2>
   %0 = stream.tensor.constant : tensor<16xi2> in !stream.resource<constant> = dense<[
     0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
   ]> : tensor<16xi2>
-  // CHECK: return %[[RET]]
-  return %0 : !stream.resource<constant>
+  // CHECK: util.return %[[RET]]
+  util.return %0 : !stream.resource<constant>
 }
 
 // -----
@@ -17,78 +17,78 @@
 // Ensures that a non-power-of-two type (i3) constant is expanded to a full byte
 // because we don't currently do unaligned sub-byte packing.
 
-// CHECK: func.func @denseTensorConstantI3()
-func.func @denseTensorConstantI3() -> !stream.resource<constant> {
+// CHECK:  util.func public @denseTensorConstantI3()
+util.func public @denseTensorConstantI3() -> !stream.resource<constant> {
   // CHECK: %[[STATIC_SIZE:.+]] = arith.constant 4 : index
   // CHECK: %[[RET:.+]] = stream.async.constant : !stream.resource<constant>{%[[STATIC_SIZE]]} = dense<[0, 7, 2, 5]> : tensor<4xi8>
   %0 = stream.tensor.constant : tensor<4xi3> in !stream.resource<constant> = dense<[0, 7, 2, 5]> : tensor<4xi3>
-  // CHECK: return %[[RET]]
-  return %0 : !stream.resource<constant>
+  // CHECK: util.return %[[RET]]
+  util.return %0 : !stream.resource<constant>
 }
 
 // -----
 
 // CHECK-LABEL: @denseTensorConstantI4
-func.func @denseTensorConstantI4() -> !stream.resource<constant> {
+util.func public @denseTensorConstantI4() -> !stream.resource<constant> {
   // CHECK: %[[STATIC_SIZE:.+]] = arith.constant 4 : index
   // CHECK: %[[RET:.+]] = stream.async.constant : !stream.resource<constant>{%[[STATIC_SIZE]]} = dense<[5, -1, 0, 3, 1, 7, -8, 4]> : tensor<8xi4>
   %0 = stream.tensor.constant : tensor<8xi4> in !stream.resource<constant> = dense<[5, 15, 0, 3, 1, 7, 8, 4]> : tensor<8xi4>
-  // CHECK: return %[[RET]]
-  return %0 : !stream.resource<constant>
+  // CHECK: util.return %[[RET]]
+  util.return %0 : !stream.resource<constant>
 }
 
 // -----
 
 // Checks that non-byte-aligned total size is not supported for constant.
 
-func.func @denseTensorConstantI4() -> !stream.resource<constant> {
+util.func public @denseTensorConstantI4() -> !stream.resource<constant> {
   // expected-error @+1 {{failed to calculate total byte count: 'tensor<5xi4>' does not have integral number of total bytes}}
   %0 = stream.tensor.constant : tensor<5xi4> in !stream.resource<constant> = dense<[5, 15, 0, 3, 1]> : tensor<5xi4>
-  return %0 : !stream.resource<constant>
+  util.return %0 : !stream.resource<constant>
 }
 
 // -----
 
 // CHECK-LABEL: @denseTensorConstantI8
-func.func @denseTensorConstantI8() -> !stream.resource<constant> {
+util.func public @denseTensorConstantI8() -> !stream.resource<constant> {
   // CHECK: %[[STATIC_SIZE:.+]] = arith.constant 8 : index
   // CHECK: %[[RET:.+]] = stream.async.constant : !stream.resource<constant>{%[[STATIC_SIZE]]} = dense<[5, 15, 0, 3, 1, 7, 8, 4]> : tensor<8xi8>
   %0 = stream.tensor.constant : tensor<8xi8> in !stream.resource<constant> = dense<[5, 15, 0, 3, 1, 7, 8, 4]> : tensor<8xi8>
-  // CHECK: return %[[RET]]
-  return %0 : !stream.resource<constant>
+  // CHECK: util.return %[[RET]]
+  util.return %0 : !stream.resource<constant>
 }
 
 // -----
 
 // CHECK-LABEL: @denseTensorSizeOfStatic
-func.func @denseTensorSizeOfStatic() -> index {
+util.func public @denseTensorSizeOfStatic() -> index {
   // CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index
   %0 = stream.tensor.sizeof tensor<12xi4> : index
-  // CHECK: return %[[C6]]
-  return %0 : index
+  // CHECK: util.return %[[C6]]
+  util.return %0 : index
 }
 
 // -----
 
 // Checks that non-byte-aligned total size is not supported for sizeof.
 
-func.func @denseTensorSizeOfStatic() -> index {
+util.func public @denseTensorSizeOfStatic() -> index {
   // expected-error @+1 {{failed to calculate total byte count: 'tensor<11xi4>' does not have integral number of total bytes}}
   %0 = stream.tensor.sizeof tensor<11xi4> : index
-  return %0 : index
+  util.return %0 : index
 }
 
 // -----
 
 // CHECK-LABEL: @denseTensorSizeOfDynamic
-func.func @denseTensorSizeOfDynamic(%arg0: index) -> index {
+util.func public @denseTensorSizeOfDynamic(%arg0: index) -> index {
   // CHECK-DAG: %[[C5:.+]] = arith.constant 5 : index
   // CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
   // CHECK: %[[MUL:.+]] = arith.muli %arg0, %[[C5]] : index
   // CHECK: %[[DIV:.+]] = arith.divui %[[MUL]], %[[C2]] : index
   %0 = stream.tensor.sizeof tensor<?x5xi4>{%arg0} : index
-  // CHECK: return %[[DIV]]
-  return %0 : index
+  // CHECK: util.return %[[DIV]]
+  util.return %0 : index
 }
 
 // -----
@@ -96,11 +96,11 @@
 // Checks that stream.tensor.load with sub-byte packing is not supported right now.
 
 // CHECK-LABEL: @denseTensorLoad
-func.func @denseTensorLoad(%arg0: !stream.resource<staging>, %arg1: index, %arg2: index, %arg3: index) -> i4 {
+util.func public @denseTensorLoad(%arg0: !stream.resource<staging>, %arg1: index, %arg2: index, %arg3: index) -> i4 {
   %c0 = arith.constant 0 : index
   // CHECK: stream.tensor.load
   %0 = stream.tensor.load %arg0[%arg3] : tensor<?xi4>{%arg1} in !stream.resource<staging>{%arg2} -> i4
-  return %0 : i4
+  util.return %0 : i4
 }
 
 // -----
@@ -108,17 +108,17 @@
 // Checks that stream.tensor.store with sub-byte packing is not supported right now.
 
 // CHECK-LABEL: @denseTensorStore
-func.func @denseTensorStore(%arg0: !stream.resource<staging>, %arg1: index, %arg2: index, %arg3: i4) -> !stream.resource<staging> {
+util.func public @denseTensorStore(%arg0: !stream.resource<staging>, %arg1: index, %arg2: index, %arg3: i4) -> !stream.resource<staging> {
   %c0 = arith.constant 0 : index
   // CHECK: stream.tensor.store
   %0 = stream.tensor.store %arg3, %arg0[%c0] : i4 -> tensor<?xi4>{%arg1} in %arg0 as !stream.resource<staging>{%arg2}
-  return %0 : !stream.resource<staging>
+  util.return %0 : !stream.resource<staging>
 }
 
 // -----
 
 // CHECK-LABEL: @denseTensorSplatI2
-func.func @denseTensorSplatI2(%arg0: i2, %arg1: index, %arg2: index) -> !stream.resource<*> {
+util.func public @denseTensorSplatI2(%arg0: i2, %arg1: index, %arg2: index) -> !stream.resource<*> {
   // CHECK: %[[C2:.+]] = arith.constant 2 : i8
   // CHECK: %[[PART:.+]] = arith.extui %arg0 : i2 to i8
   // CHECK: %[[SHL0:.+]] = arith.shli %[[PART]], %[[C2]] : i8
@@ -129,14 +129,14 @@
   // CHECK: %[[FULL:.+]] = arith.ori %[[SH2]], %[[PART]] : i8
   // CHECK: %[[SPLAT:.+]] = stream.async.splat %[[FULL]] : i8 -> !stream.resource<*>{%arg2}
   %0 = stream.tensor.splat %arg0 : i2 -> tensor<?x1x16xi2>{%arg1} in !stream.resource<*>{%arg2}
-  // CHECK: return %[[SPLAT]] : !stream.resource<*>
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %[[SPLAT]] : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @denseTensorFillI4
-func.func @denseTensorFillI4(%arg0: i4, %arg1: !stream.resource<*>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index) -> !stream.resource<*> {
+util.func public @denseTensorFillI4(%arg0: i4, %arg1: !stream.resource<*>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index) -> !stream.resource<*> {
   // CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
   // CHECK-DAG: %[[C4:.+]] = arith.constant 4 : i8
   // CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
@@ -152,14 +152,14 @@
   // CHECK: %[[END:.+]] = arith.addi %[[OFFSET]], %[[LEN]] : index
   // CHECK: %[[FILL:.+]] = stream.async.fill %[[FULL]], %arg1[%[[OFFSET]] to %[[END]] for %[[LEN]]] : i8 -> %arg1 as !stream.resource<*>{%arg3}
   %0 = stream.tensor.fill %arg0, %arg1[%arg4, %arg5 for %arg6, %arg7] : i4 -> tensor<?x16xi4>{%arg2} in %arg1 as !stream.resource<*>{%arg3}
-  // CHECK: return %[[FILL]]
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %[[FILL]]
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @denseTensorSliceI2
-func.func @denseTensorSliceI2(%arg0: !stream.resource<*>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index) -> !stream.resource<*> {
+util.func public @denseTensorSliceI2(%arg0: !stream.resource<*>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index) -> !stream.resource<*> {
   %c2 = arith.constant 2 : index
   // CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
   // CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
@@ -169,8 +169,8 @@
   // CHECK: %[[LEN:.+]] = arith.addi %[[OFFSET]], %arg4 : index
   // CHECK: %[[SLICE:.+]] = stream.async.slice %arg0[%[[OFFSET]] to %[[LEN]]] : !stream.resource<*>{%arg2} -> !stream.resource<*>{%arg4}
   %0 = stream.tensor.slice %arg0[%arg5, %arg6 for %arg3, %c2] : tensor<?x8xi2>{%arg1} in !stream.resource<*>{%arg2} -> tensor<?x2xi2>{%arg3} in !stream.resource<*>{%arg4}
-  // CHECK: return %[[SLICE]] : !stream.resource<*>
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %[[SLICE]] : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
@@ -179,7 +179,7 @@
 // because we don't currently do unaligned sub-byte packing.
 
 // CHECK-LABEL: @denseTensorSliceI3
-func.func @denseTensorSliceI3(%arg0: !stream.resource<*>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index) -> !stream.resource<*> {
+util.func public @denseTensorSliceI3(%arg0: !stream.resource<*>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index) -> !stream.resource<*> {
   %c2 = arith.constant 2 : index
   // CHECK: %[[C8:.+]] = arith.constant 8 : index
   // CHECK: %[[MUL:.+]] = arith.muli %arg5, %[[C8]] : index
@@ -187,8 +187,8 @@
   // CHECK: %[[LEN:.+]] = arith.addi %[[OFFSET]], %arg4 : index
   // CHECK: %[[SLICE:.+]] = stream.async.slice %arg0[%[[OFFSET]] to %[[LEN]]] : !stream.resource<*>{%arg2} -> !stream.resource<*>{%arg4}
   %0 = stream.tensor.slice %arg0[%arg5, %arg6 for %arg3, %c2] : tensor<?x8xi3>{%arg1} in !stream.resource<*>{%arg2} -> tensor<?x2xi3>{%arg3} in !stream.resource<*>{%arg4}
-  // CHECK: return %[[SLICE]] : !stream.resource<*>
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %[[SLICE]] : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
@@ -197,7 +197,7 @@
 // because we don't currently do unaligned sub-byte packing.
 
 // CHECK-LABEL: @denseTensorUpdateI3
-func.func @denseTensorUpdateI3(%arg0: !stream.resource<*>, %arg1: index, %arg2: !stream.resource<*>, %arg3: index, %arg4: index, %arg5: index, %arg6: index) -> !stream.resource<*> {
+util.func public @denseTensorUpdateI3(%arg0: !stream.resource<*>, %arg1: index, %arg2: !stream.resource<*>, %arg3: index, %arg4: index, %arg5: index, %arg6: index) -> !stream.resource<*> {
   // CHECK: %[[C4:.+]] = arith.constant 4 : index
   // CHECK: %[[MUL:.+]] = arith.muli %arg5, %[[C4]] : index
   // CHECK: %[[OFFSET:.+]] = arith.addi %[[MUL]], %arg6 : index
@@ -205,14 +205,14 @@
   // CHECK: %[[UPDATE:.+]] = stream.async.update %arg0, %arg2[%[[OFFSET]] to %[[LEN]]] : !stream.resource<*>{%arg1} -> %arg2 as !stream.resource<*>{%arg4}
 
   %0 = stream.tensor.update %arg0, %arg2[%arg5, %arg6] : tensor<8x4xi3> in !stream.resource<*>{%arg1} -> tensor<?x4xi3>{%arg3} in %arg2 as !stream.resource<*>{%arg4}
-  // CHECK: return %[[UPDATE]] : !stream.resource<*>
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %[[UPDATE]] : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // CHECK-LABEL: @denseTensorUpdateI4
-func.func @denseTensorUpdateI4(%arg0: !stream.resource<*>, %arg1: index, %arg2: !stream.resource<*>, %arg3: index, %arg4: index, %arg5: index, %arg6: index) -> !stream.resource<*> {
+util.func public @denseTensorUpdateI4(%arg0: !stream.resource<*>, %arg1: index, %arg2: !stream.resource<*>, %arg3: index, %arg4: index, %arg5: index, %arg6: index) -> !stream.resource<*> {
   // CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
   // CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
   // CHECK: %[[MUL:.+]] = arith.muli %arg5, %[[C4]] : index
@@ -221,6 +221,6 @@
   // CHECK: %[[LEN:.+]] = arith.addi %[[OFFSET]], %arg1 : index
   // CHECK: %[[UPDATE:.+]] = stream.async.update %arg0, %arg2[%[[OFFSET]] to %[[LEN]]] : !stream.resource<*>{%arg1} -> %arg2 as !stream.resource<*>{%arg4}
   %0 = stream.tensor.update %arg0, %arg2[%arg5, %arg6] : tensor<8x4xi4> in !stream.resource<*>{%arg1} -> tensor<?x4xi4>{%arg3} in %arg2 as !stream.resource<*>{%arg4}
-  // CHECK: return %[[UPDATE]] : !stream.resource<*>
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %[[UPDATE]] : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/fold_globals.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/fold_globals.mlir
index f817857..4c9ce60 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/fold_globals.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/fold_globals.mlir

@@ -6,27 +6,27 @@
 
 // CHECK: util.global public mutable @uniformConstants = #stream.timepoint<immediate>
 util.global public mutable @uniformConstants : !stream.timepoint
-func.func @foo() {
+util.func public @foo() {
   %timepoint = stream.timepoint.immediate => !stream.timepoint
   // CHECK-NOT: util.global.store
   util.global.store %timepoint, @uniformConstants : !stream.timepoint
-  return
+  util.return
 }
-func.func @bar() {
+util.func public @bar() {
   %timepoint = stream.timepoint.immediate => !stream.timepoint
   // CHECK-NOT: util.global.store
   util.global.store %timepoint, @uniformConstants : !stream.timepoint
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-NOT: @immutable
 util.global private @immutable = #stream.timepoint<immediate> : !stream.timepoint
-func.func @foo() -> !stream.timepoint {
+util.func public @foo() -> !stream.timepoint {
   // CHECK-NOT: util.global.load @immutable
   // CHECK: %[[IMMEDIATE:.+]] = stream.timepoint.immediate => !stream.timepoint
   %0 = util.global.load @immutable : !stream.timepoint
-  // CHECK: return %[[IMMEDIATE]]
-  return %0 : !stream.timepoint
+  // CHECK: util.return %[[IMMEDIATE]]
+  util.return %0 : !stream.timepoint
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/fold_uniform_operands.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/fold_uniform_operands.mlir
index 985f28e..f37b7a9 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/fold_uniform_operands.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/fold_uniform_operands.mlir

@@ -11,8 +11,8 @@
 stream.executable private @deduplicateOperandsEx {
   stream.executable.export public @dispatch
   builtin.module  {
-    // CHECK: func.func @dispatch(%[[BINDING:.+]]: !stream.binding, %[[A01:.+]]: i32, %[[B0:.+]]: index, %[[C:.+]]: i1, %[[B1:.+]]: index)
-    func.func @dispatch(%binding: !stream.binding, %a0: i32, %b0: index, %c: i1, %a1: i32, %b1: index) {
+    // CHECK:  util.func public @dispatch(%[[BINDING:.+]]: !stream.binding, %[[A01:.+]]: i32, %[[B0:.+]]: index, %[[C:.+]]: i1, %[[B1:.+]]: index)
+     util.func public @dispatch(%binding: !stream.binding, %a0: i32, %b0: index, %c: i1, %a1: i32, %b1: index) {
       // CHECK-NEXT: util.optimization_barrier %[[BINDING]] : !stream.binding
       util.optimization_barrier %binding : !stream.binding
       // CHECK-NEXT: util.optimization_barrier %[[A01]] : i32
@@ -25,12 +25,12 @@
       util.optimization_barrier %b1 : index
       // CHECK-NEXT: util.optimization_barrier %[[C]] : i1
       util.optimization_barrier %c : i1
-      return
+      util.return
     }
   }
 }
-// CHECK: func.func @deduplicateOperands(%[[A:.+]]: i32, %[[B:.+]]: index, %[[C:.+]]: i1)
-func.func @deduplicateOperands(%a: i32, %b: index, %c: i1) {
+// CHECK:  util.func public @deduplicateOperands(%[[A:.+]]: i32, %[[B:.+]]: index, %[[C:.+]]: i1)
+util.func public @deduplicateOperands(%a: i32, %b: index, %c: i1) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c20 = arith.constant 20 : index
@@ -45,7 +45,7 @@
       rw %capture[%c0 for %c20] : !stream.resource<transient>{%c20}
     }
   } => !stream.timepoint
-  return
+  util.return
 }
 
 // -----
@@ -60,8 +60,8 @@
 stream.executable private @inlineConstantOperandsEx {
   stream.executable.export public @dispatch
   builtin.module  {
-    // CHECK: func.func @dispatch(%[[BINDING:.+]]: !stream.binding, %[[A:.+]]: i32, %[[C:.+]]: i1)
-    func.func @dispatch(%binding: !stream.binding, %a: i32, %b: index, %c: i1) {
+    // CHECK:  util.func public @dispatch(%[[BINDING:.+]]: !stream.binding, %[[A:.+]]: i32, %[[C:.+]]: i1)
+     util.func public @dispatch(%binding: !stream.binding, %a: i32, %b: index, %c: i1) {
       // CHECK: %[[B:.+]] = arith.constant 20 : index
       // CHECK-NEXT: util.optimization_barrier %[[BINDING]] : !stream.binding
       util.optimization_barrier %binding : !stream.binding
@@ -71,12 +71,12 @@
       util.optimization_barrier %b : index
       // CHECK-NEXT: util.optimization_barrier %[[C]] : i1
       util.optimization_barrier %c : i1
-      return
+      util.return
     }
   }
 }
-// CHECK: func.func @inlineConstantOperands(%[[A:.+]]: i32)
-func.func @inlineConstantOperands(%a: i32) {
+// CHECK:  util.func public @inlineConstantOperands(%[[A:.+]]: i32)
+util.func public @inlineConstantOperands(%a: i32) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c20 = arith.constant 20 : index
@@ -93,5 +93,5 @@
       rw %capture[%c0 for %c20] : !stream.resource<transient>{%c20}
     }
   } => !stream.timepoint
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/fuse_dispatch_bindings.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/fuse_dispatch_bindings.mlir
index 6f0f7a6..14e8fb2 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/fuse_dispatch_bindings.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/fuse_dispatch_bindings.mlir

@@ -16,9 +16,9 @@
 stream.executable private @rebaseBindingsEx {
   stream.executable.export public @dispatch attributes {stream.resources = #aliasConfig}
   builtin.module  {
-    // CHECK: func.func @dispatch(%[[BINDING_A:.+]]: !stream.binding, %[[BINDING_B:.+]]: !stream.binding,
+    // CHECK:  util.func public @dispatch(%[[BINDING_A:.+]]: !stream.binding, %[[BINDING_B:.+]]: !stream.binding,
     // CHECK-SAME:           %[[OFFSET_A:.+]]: index, %[[OFFSET_B:.+]]: index, %[[OPERAND:.+]]: index)
-    func.func @dispatch(%binding_a: !stream.binding, %binding_b: !stream.binding, %operand: index) {
+     util.func public @dispatch(%binding_a: !stream.binding, %binding_b: !stream.binding, %operand: index) {
       %c0 = arith.constant 0 : index
       %c20 = arith.constant 20 : index
 
@@ -35,12 +35,12 @@
 
       // CHECK-NEXT: util.optimization_barrier %[[OPERAND]] : index
       util.optimization_barrier %operand : index
-      return
+      util.return
     }
   }
 }
-// CHECK: func.func @rebaseBindings(%[[OPERAND:.+]]: index)
-func.func @rebaseBindings(%operand: index) {
+// CHECK:  util.func public @rebaseBindings(%[[OPERAND:.+]]: index)
+util.func public @rebaseBindings(%operand: index) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c20 = arith.constant 20 : index
@@ -74,7 +74,7 @@
       ro %capture1[%c160 for %c20] : !stream.resource<transient>{%c200}
     }
   } => !stream.timepoint
-  return
+  util.return
 }
 
 // -----
@@ -97,9 +97,9 @@
 stream.executable private @deduplicateBindingsEx {
   stream.executable.export public @dispatch attributes {stream.resources = #aliasConfig}
   builtin.module  {
-    // CHECK: func.func @dispatch(%[[BINDING_A:.+]]: !stream.binding, %[[BINDING_B:.+]]: !stream.binding,
+    // CHECK:  util.func public @dispatch(%[[BINDING_A:.+]]: !stream.binding, %[[BINDING_B:.+]]: !stream.binding,
     // CHECK-SAME:           %[[OFFSET_A:.+]]: index, %[[OFFSET_C:.+]]: index, %[[OFFSET_B:.+]]: index, %[[OPERAND:.+]]: index)
-    func.func @dispatch(%binding_a: !stream.binding, %binding_b: !stream.binding, %binding_c: !stream.binding, %operand: index) {
+     util.func public @dispatch(%binding_a: !stream.binding, %binding_b: !stream.binding, %binding_c: !stream.binding, %operand: index) {
       %c0 = arith.constant 0 : index
       %c20 = arith.constant 20 : index
       %c40 = arith.constant 40 : index
@@ -123,12 +123,12 @@
 
       // CHECK-NEXT: util.optimization_barrier %[[OPERAND]] : index
       util.optimization_barrier %operand : index
-      return
+      util.return
     }
   }
 }
-// CHECK: func.func @deduplicateBindings(%[[OPERAND:.+]]: index)
-func.func @deduplicateBindings(%operand: index) {
+// CHECK:  util.func public @deduplicateBindings(%[[OPERAND:.+]]: index)
+util.func public @deduplicateBindings(%operand: index) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c20 = arith.constant 20 : index
@@ -166,5 +166,5 @@
       rw %capture0[%c20 for %c20] : !stream.resource<transient>{%c200}
     }
   } => !stream.timepoint
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/fuse_dispatch_bindings_noalias.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/fuse_dispatch_bindings_noalias.mlir
index 1736232..6b9696b 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/fuse_dispatch_bindings_noalias.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/fuse_dispatch_bindings_noalias.mlir

@@ -13,9 +13,9 @@
 stream.executable private @deduplicateBindingsEx {
   stream.executable.export public @dispatch attributes {stream.resources = #noaliasConfig}
   builtin.module  {
-    // CHECK: func.func @dispatch(%[[BINDING_A:.+]]: !stream.binding, %[[BINDING_C:.+]]: !stream.binding,
+    // CHECK:  util.func public @dispatch(%[[BINDING_A:.+]]: !stream.binding, %[[BINDING_C:.+]]: !stream.binding,
     // CHECK-SAME:           %[[OFFSET_A:.+]]: index, %[[OFFSET_B:.+]]: index, %[[OFFSET_C:.+]]: index, %[[OPERAND:.+]]: index)
-    func.func @dispatch(%binding_a: !stream.binding, %binding_b: !stream.binding, %binding_c: !stream.binding, %operand: index) {
+     util.func public @dispatch(%binding_a: !stream.binding, %binding_b: !stream.binding, %binding_c: !stream.binding, %operand: index) {
       %c0 = arith.constant 0 : index
       %c20 = arith.constant 20 : index
       %c40 = arith.constant 40 : index
@@ -39,12 +39,12 @@
 
       // CHECK-NEXT: util.optimization_barrier %[[OPERAND]] : index
       util.optimization_barrier %operand : index
-      return
+      util.return
     }
   }
 }
-// CHECK: func.func @deduplicateBindings(%[[OPERAND:.+]]: index)
-func.func @deduplicateBindings(%operand: index) {
+// CHECK:  util.func public @deduplicateBindings(%[[OPERAND:.+]]: index)
+util.func public @deduplicateBindings(%operand: index) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c20 = arith.constant 20 : index
@@ -78,5 +78,5 @@
       rw %capture0[%c20 for %c20] : !stream.resource<transient>{%c200}
     }
   } => !stream.timepoint
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/layout_slices.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/layout_slices.mlir
index 30cf36d..1bc6fc2 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/layout_slices.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/layout_slices.mlir

@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(func.func(iree-stream-layout-slices, cse))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module( util.func(iree-stream-layout-slices, cse))' %s | FileCheck %s
 
 #layoutStaticConfig = #stream.resource_config<{
   max_allocation_size = 1073741824,
@@ -9,7 +9,7 @@
 }>
 
 // CHECK-LABEL: @layoutStatic
-func.func @layoutStatic() -> (index, index, index, index, index, index, index)
+util.func public @layoutStatic() -> (index, index, index, index, index, index, index)
     attributes {stream.resources = #layoutStaticConfig} {
   %c100 = arith.constant 100 : index
   %c200 = arith.constant 200 : index
@@ -22,9 +22,9 @@
     [5, 8] = %c100,  // +208 (after 200 align 16)
   }) : index
   // 224 + 200 align 16 = 432 total bytes required
-  // CHECK: return %c432
+  // CHECK: util.return %c432
   // CHECK-SAME: %c0, %c112, %c0, %c224, %c0, %c208
-  return %t#0, %t#1, %t#2, %t#3, %t#4, %t#5, %t#6 : index, index, index, index, index, index, index
+  util.return %t#0, %t#1, %t#2, %t#3, %t#4, %t#5, %t#6 : index, index, index, index, index, index, index
 }
 
 // -----
@@ -39,7 +39,7 @@
 
 // CHECK-LABEL: @layoutDynamic
 // CHECK-SAME: (%[[SIZE_A:.+]]: index, %[[SIZE_B:.+]]: index)
-func.func @layoutDynamic(%size_a: index, %size_b: index) -> (index, index, index, index)
+util.func public @layoutDynamic(%size_a: index, %size_b: index) -> (index, index, index, index)
     attributes {stream.resources = #layoutDynamicConfig} {
   %t:4 = stream.resource.pack slices({
     [0, 1] = %size_a,
@@ -54,8 +54,8 @@
   // CHECK-DAG: %2 = util.align %[[SIZE_B]], %c16 : index
   // CHECK-DAG: %3 = arith.addi %1, %2 : index
 
-  // CHECK: return %3, %c0, %1, %c0
-  return %t#0, %t#1, %t#2, %t#3 : index, index, index, index
+  // CHECK: util.return %3, %c0, %1, %c0
+  util.return %t#0, %t#1, %t#2, %t#3 : index, index, index, index
 }
 
 // -----
@@ -70,7 +70,7 @@
 
 // CHECK-LABEL: @layoutMixedStaticDynamic
 // CHECK-SAME: (%[[SIZE_A:.+]]: index, %[[SIZE_B:.+]]: index)
-func.func @layoutMixedStaticDynamic(%size_a: index, %size_b: index) -> (index, index, index, index, index)
+util.func public @layoutMixedStaticDynamic(%size_a: index, %size_b: index) -> (index, index, index, index, index)
     attributes {stream.resources = #layoutMixedStaticDynamicConfig} {
   %c100 = arith.constant 100 : index
   %c200 = arith.constant 200 : index
@@ -89,6 +89,6 @@
   // CHECK-DAG: %2 = util.align %[[SIZE_B]], %c16 : index
   // CHECK-DAG: %3 = arith.addi %1, %2 : index
 
-  // CHECK: return %3, %c0, %c208, %1, %c0
-  return %t#0, %t#1, %t#2, %t#3, %t#4 : index, index, index, index, index
+  // CHECK: util.return %3, %c0, %c208, %1, %c0
+  util.return %t#0, %t#1, %t#2, %t#3, %t#4 : index, index, index, index, index
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/materialize_builtins.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/materialize_builtins.mlir
index 92a0627..bc50f4e 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/materialize_builtins.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/materialize_builtins.mlir

@@ -5,13 +5,13 @@
 // is set.
 
 // CHECK-LABEL: @splatI32
-func.func @splatI32(%arg0: index, %arg1: i32) -> !stream.resource<*> {
+util.func public @splatI32(%arg0: index, %arg1: i32) -> !stream.resource<*> {
   // NATIVE: %[[RET:.+]] = stream.async.splat %arg1
   // EMULATED: %[[COUNT:.+]] = arith.divui %arg0, %c4
   // EMULATED: %[[RET:.+]] = stream.async.dispatch @__builtin_splat_i32::@__builtin_splat_i32[%[[COUNT]]](%arg1, %[[COUNT]]) : (i32, index) -> !stream.resource<*>{%arg0}
   %0 = stream.async.splat %arg1 : i32 -> !stream.resource<*>{%arg0}
-  // CHECK: return %[[RET]]
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %[[RET]]
+  util.return %0 : !stream.resource<*>
 }
 
 // NATIVE-NOT: stream.executable private @__builtin_splat_i32
@@ -22,12 +22,12 @@
 // Tests expansion of the stream.async.splat op for i64 types.
 
 // CHECK-LABEL: @builtinSplatI64
-func.func @builtinSplatI64(%arg0: index, %arg1: i64) -> !stream.resource<*> {
+util.func public @builtinSplatI64(%arg0: index, %arg1: i64) -> !stream.resource<*> {
   // CHECK: %[[COUNT:.+]] = arith.divui %arg0, %c8
   // CHECK: %[[RET:.+]] = stream.async.dispatch @__builtin_splat_i64::@__builtin_splat_i64[%[[COUNT]]](%arg1, %[[COUNT]]) : (i64, index) -> !stream.resource<*>{%arg0}
   %0 = stream.async.splat %arg1 : i64 -> !stream.resource<*>{%arg0}
-  // CHECK: return %[[RET]]
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %[[RET]]
+  util.return %0 : !stream.resource<*>
 }
 
 // CHECK: stream.executable private @__builtin_splat_i64
@@ -38,12 +38,12 @@
 
 // CHECK-LABEL: @builtinFillI64
 // CHECK-SAME: (%[[RES:.+]]: !stream.resource<*>, %[[SIZE:.+]]: index, %[[VALUE:.+]]: i64, %[[BYTE_OFFSET:.+]]: index, %[[BYTE_END:.+]]: index, %[[BYTE_LENGTH:.+]]: index)
-func.func @builtinFillI64(%res: !stream.resource<*>, %size: index, %value: i64, %byte_offset: index, %byte_end: index, %byte_length: index) -> !stream.resource<*> {
+util.func public @builtinFillI64(%res: !stream.resource<*>, %size: index, %value: i64, %byte_offset: index, %byte_end: index, %byte_length: index) -> !stream.resource<*> {
   // CHECK: %[[COUNT:.+]] = arith.divui %[[BYTE_LENGTH]], %c8
   // CHECK: %[[RET:.+]] = stream.async.dispatch @__builtin_fill_i64::@__builtin_fill_i64[%[[COUNT]]](%[[RES]][%[[BYTE_OFFSET]] to %[[BYTE_END]] for %[[BYTE_LENGTH]]], %[[VALUE]], %[[BYTE_OFFSET]], %[[COUNT]]) : (!stream.resource<*>{%[[SIZE]]}, i64, index, index) -> %[[RES]]{%[[SIZE]]}
   %0 = stream.async.fill %value, %res[%byte_offset to %byte_end for %byte_length] : i64 -> %arg0 as !stream.resource<*>{%size}
-  // CHECK: return %[[RET]]
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %[[RET]]
+  util.return %0 : !stream.resource<*>
 }
 
 // CHECK: stream.executable private @__builtin_fill_i64
@@ -54,7 +54,7 @@
 // correct places.
 
 // CHECK-LABEL: @builtinSplatI64
-func.func @builtinSplatI64(%arg0: index, %arg1: i64) -> (!stream.resource<*>, !stream.timepoint) {
+util.func public @builtinSplatI64(%arg0: index, %arg1: i64) -> (!stream.resource<*>, !stream.timepoint) {
   // CHECK: %[[COUNT:.+]] = arith.divui %arg0, %c8
   // CHECK: = stream.async.execute
   %0:2 = stream.async.execute with() -> !stream.resource<*>{%arg0} {
@@ -67,7 +67,7 @@
     }
     stream.yield %1 : !stream.resource<*>{%arg0}
   } => !stream.timepoint
-  return %0#0, %0#1 : !stream.resource<*>, !stream.timepoint
+  util.return %0#0, %0#1 : !stream.resource<*>, !stream.timepoint
 }
 
 // CHECK: stream.executable private @__builtin_splat_i64
@@ -87,14 +87,14 @@
 
 // CHECK: stream.executable private @__builtin_splat_i64
 
-// CHECK: func.func @otherUser
-func.func @otherUser() -> !stream.resource<*> {
+// CHECK:  util.func public @otherUser
+util.func public @otherUser() -> !stream.resource<*> {
   %c128 = arith.constant 128 : index
   %c1_i64 = arith.constant 1 : i64
   // CHECK: %[[RET:.+]] = stream.async.dispatch @__builtin_splat_i64::@__builtin_splat_i64
   %0 = stream.async.splat %c1_i64 : i64 -> !stream.resource<*>{%c128}
-  // CHECK: return %[[RET]]
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %[[RET]]
+  util.return %0 : !stream.resource<*>
 }
 
 // CHECK-NOT: stream.executable private @__builtin_splat_i64

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/materialize_copy_on_write.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/materialize_copy_on_write.mlir
index 133eda0..9ca52ce 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/materialize_copy_on_write.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/materialize_copy_on_write.mlir

@@ -1,19 +1,19 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(func.func(iree-stream-materialize-copy-on-write))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module( util.func(iree-stream-materialize-copy-on-write))' %s | FileCheck %s
 
 // Tests that block arguments (including function arguments) are always cloned.
 // Until a whole-program analysis runs we don't know their semantics.
 
 // CHECK-LABEL: @blockArgsNeedCopies
 //  CHECK-SAME: (%[[SRC:.+]]: !stream.resource<*>, %[[SIZE:.+]]: index)
-func.func @blockArgsNeedCopies(%src: !stream.resource<*>, %size: index) -> !stream.resource<*> {
+util.func public @blockArgsNeedCopies(%src: !stream.resource<*>, %size: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   %c123_i32 = arith.constant 123 : i32
   // CHECK: %[[CLONE:.+]] = stream.async.clone %[[SRC]] : !stream.resource<*>{%[[SIZE]]} -> !stream.resource<*>{%[[SIZE]]}
   // CHECK: %[[FILL:.+]] = stream.async.fill %c123_i32, %[[CLONE]]{{.+}} -> %[[CLONE]]
   %0 = stream.async.fill %c123_i32, %src[%c0 to %c128 for %c128] : i32 -> %src as !stream.resource<*>{%size}
-  // CHECK: return %[[FILL]]
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %[[FILL]]
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
@@ -22,7 +22,7 @@
 
 // CHECK-LABEL: @singleUseTiedOperand
 //  CHECK-SAME: (%[[SIZE:.+]]: index)
-func.func @singleUseTiedOperand(%size: index) -> !stream.resource<*> {
+util.func public @singleUseTiedOperand(%size: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   %c256 = arith.constant 256 : index
@@ -37,7 +37,7 @@
   // CHECK-NOT: stream.async.clone
   // CHECK: stream.async.fill
   %2 = stream.async.fill %c789_i32, %1[%c128 to %c256 for %c128] : i32 -> %0 as !stream.resource<*>{%size}
-  return %2 : !stream.resource<*>
+  util.return %2 : !stream.resource<*>
 }
 
 // -----
@@ -46,7 +46,7 @@
 // user.
 
 // CHECK-LABEL: @multipleUsesOneUser
-func.func private @multipleUsesOneUser(%size: index) -> !stream.resource<*> {
+util.func private @multipleUsesOneUser(%size: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   %c256 = arith.constant 256 : index
@@ -56,7 +56,7 @@
   // CHECK-NOT: stream.async.clone
   // CHECK: stream.async.dispatch
   %1 = stream.async.dispatch @ex::@dispatch(%0[%c0 to %c128 for %c128], %0[%c128 to %c256 for %c128]) : (!stream.resource<*>{%size}, !stream.resource<*>{%size}) -> %0{%size}
-  return %1 : !stream.resource<*>
+  util.return %1 : !stream.resource<*>
 }
 
 // -----
@@ -66,7 +66,7 @@
 
 // CHECK-LABEL: @oneCopyPerOperation
 //  CHECK-SAME: (%[[SRC:.+]]: !stream.resource<*>, %[[SIZE:.+]]: index)
-func.func @oneCopyPerOperation(%src: !stream.resource<*>, %size: index) -> !stream.resource<*> {
+util.func public @oneCopyPerOperation(%src: !stream.resource<*>, %size: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   %c256 = arith.constant 128 : index
@@ -74,8 +74,8 @@
   // CHECK-NOT: stream.async.clone
   // CHECK: %[[RESULT:.+]] = stream.async.dispatch @ex::@dispatch(%[[CLONE]]{{.*}}, %[[CLONE]]{{.*}}) {{.*}} -> %[[CLONE]]{%[[SIZE]]}
   %0 = stream.async.dispatch @ex::@dispatch(%src[%c0 to %c128 for %c128], %src[%c128 to %c256 for %c128]) : (!stream.resource<*>{%size}, !stream.resource<*>{%size}) -> %src{%size}
-  // CHECK: return %[[RESULT]]
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %[[RESULT]]
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
@@ -86,7 +86,7 @@
 
 // CHECK-LABEL: @multiUseTiedOperand
 //  CHECK-SAME: (%[[SIZE:.+]]: index)
-func.func @multiUseTiedOperand(%size: index) -> (!stream.resource<*>, !stream.resource<*>) {
+util.func public @multiUseTiedOperand(%size: index) -> (!stream.resource<*>, !stream.resource<*>) {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   %c256 = arith.constant 256 : index
@@ -101,7 +101,7 @@
   // CHECK: %[[CLONE1:.+]] = stream.async.clone %[[SPLAT]]
   // CHECK: %[[FILL1:.+]] = stream.async.fill %c789_i32, %[[CLONE1]]
   %2 = stream.async.fill %c789_i32, %0[%c128 to %c256 for %c128] : i32 -> %0 as !stream.resource<*>{%size}
-  return %1, %2 : !stream.resource<*>, !stream.resource<*>
+  util.return %1, %2 : !stream.resource<*>, !stream.resource<*>
 }
 
 // -----
@@ -112,7 +112,7 @@
 
 // CHECK-LABEL: @tiedCollectivesTODO
 //  CHECK-SAME: (%[[CHANNEL:.+]]: !stream.channel, %[[SEND_RECV:.+]]: !stream.resource<*>, %[[SEND_SIZE:.+]]: index, %[[RECV_SIZE:.+]]: index, %[[COUNT:.+]]: index)
-func.func private @tiedCollectivesTODO(%channel: !stream.channel, %send_recv: !stream.resource<*>, %send_size: index, %recv_size: index, %count: index) -> !stream.resource<*> {
+util.func private @tiedCollectivesTODO(%channel: !stream.channel, %send_recv: !stream.resource<*>, %send_size: index, %recv_size: index, %count: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   // CHECK: %[[RECV_CLONE:.+]] = stream.async.clone on(#hal.affinity.queue<[0]>) %[[SEND_RECV]]
   // CHECK: %[[ALL_GATHER:.+]] = stream.async.collective<all_gather : f32>[%[[COUNT]]]
@@ -123,8 +123,8 @@
       %send_recv[%c0 to %recv_size for %recv_size] :
       // CHECK-SAME: !stream.resource<*>{%[[SEND_SIZE]]} -> %[[RECV_CLONE]] as !stream.resource<*>{%[[RECV_SIZE]]}
       !stream.resource<*>{%send_size} -> %recv as !stream.resource<*>{%recv_size}
-  // CHECK: return %[[ALL_GATHER]]
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %[[ALL_GATHER]]
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
@@ -134,7 +134,7 @@
 // original contents for use by @dispatch1.
 
 // CHECK-LABEL: @tiedDispatches
-func.func private @tiedDispatches() {
+util.func private @tiedDispatches() {
   %c0_i32 = arith.constant 0 : i32
   %c1_i32 = arith.constant 1 : i32
   %c0 = arith.constant 0 : index
@@ -156,7 +156,7 @@
   // CHECK-SAME: (!stream.resource<*>{%c40}, !stream.resource<*>{%c40}) -> %[[DISPATCH0]]{%c40}
   %dispatch1 = stream.async.dispatch @ex::@dispatch1[%c1, %c1, %c1](%dispatch0[%c0 to %c40 for %c40], %splat0[%c0 to %c40 for %c40]) : (!stream.resource<*>{%c40}, !stream.resource<*>{%c40}) -> %dispatch0{%c40}
 
-  return
+  util.return
 }
 
 // -----
@@ -165,7 +165,7 @@
 // take care of them later.
 
 // CHECK-LABEL: @blockArgMove
-func.func @blockArgMove(%cond: i1, %size: index) -> (!stream.resource<*>, !stream.resource<*>) {
+util.func public @blockArgMove(%cond: i1, %size: index) -> (!stream.resource<*>, !stream.resource<*>) {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   %c123_i32 = arith.constant 123 : i32
@@ -185,5 +185,5 @@
   cf.cond_br %cond, ^bb1(%fill0, %bb1_1_new : !stream.resource<*>, !stream.resource<*>),
                  ^bb2(%fill0, %bb1_1_new : !stream.resource<*>, !stream.resource<*>)
 ^bb2(%bb2_0: !stream.resource<*>, %bb2_1: !stream.resource<*>):
-  return %bb2_0, %bb2_1 : !stream.resource<*>, !stream.resource<*>
+  util.return %bb2_0, %bb2_1 : !stream.resource<*>, !stream.resource<*>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/pack_constants.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/pack_constants.mlir
index 2b5adc3..c472e82 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/pack_constants.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/pack_constants.mlir

@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(func.func(iree-stream-pack-constants))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module( util.func(iree-stream-pack-constants))' %s | FileCheck %s
 
 // This is a high level test of the structure emitted by the pass.
 // Subsequent tests focus on individual components.
@@ -14,7 +14,7 @@
 // CHECK-NEXT: ]>
 
 // CHECK-LABEL: @resourceConstants
-func.func @resourceConstants() -> (!stream.resource<constant>, !stream.resource<constant>, !stream.resource<constant>, !stream.timepoint) {
+util.func public @resourceConstants() -> (!stream.resource<constant>, !stream.resource<constant>, !stream.resource<constant>, !stream.timepoint) {
   %c4 = arith.constant 4 : index
   %c8 = arith.constant 8 : index
   %c48 = arith.constant 48 : index
@@ -48,8 +48,8 @@
   // CHECK: %[[RES1:.+]] = stream.resource.subview %[[IF]]#1[%c64] : !stream.resource<constant>{%c192} -> !stream.resource<constant>{%c8}
   // CHECK: %[[RES2:.+]] = stream.resource.subview %[[IF]]#1[%c128] : !stream.resource<constant>{%c192} -> !stream.resource<constant>{%c48}
 
-  // CHECK: return %[[RES0]], %[[RES1]], %[[RES2]], %[[IF]]#0
-  return %0#0, %0#1, %0#2, %0#3 : !stream.resource<constant>, !stream.resource<constant>, !stream.resource<constant>, !stream.timepoint
+  // CHECK: util.return %[[RES0]], %[[RES1]], %[[RES2]], %[[IF]]#0
+  util.return %0#0, %0#1, %0#2, %0#3 : !stream.resource<constant>, !stream.resource<constant>, !stream.resource<constant>, !stream.timepoint
 }
 
 // -----
@@ -63,7 +63,7 @@
 // CHECK: ]>
 
 // CHECK-LABEL: @resourceVariables
-func.func @resourceVariables() -> (!stream.resource<variable>, !stream.resource<variable>, !stream.timepoint) {
+util.func public @resourceVariables() -> (!stream.resource<variable>, !stream.resource<variable>, !stream.timepoint) {
   %c8 = arith.constant 8 : index
   %c1024 = arith.constant 1024 : index
 
@@ -80,8 +80,8 @@
     !stream.resource<variable>{%c8} = dense<[101, 102]> : tensor<2xi32>
     => !stream.timepoint
 
-  // CHECK: return %[[RES0]], %[[RES1]], %[[READ_TIMEPOINT]]
-  return %0#0, %0#1, %0#2 : !stream.resource<variable>, !stream.resource<variable>, !stream.timepoint
+  // CHECK: util.return %[[RES0]], %[[RES1]], %[[READ_TIMEPOINT]]
+  util.return %0#0, %0#1, %0#2 : !stream.resource<variable>, !stream.resource<variable>, !stream.timepoint
 }
 
 // -----
@@ -108,7 +108,7 @@
 // CHECK: ]>
 
 // CHECK-LABEL: @splitResourceConstants
-func.func @splitResourceConstants() -> (!stream.resource<constant>, !stream.resource<constant>, !stream.timepoint)
+util.func public @splitResourceConstants() -> (!stream.resource<constant>, !stream.resource<constant>, !stream.timepoint)
     attributes {stream.resources = #splitResourceConstantsConfig} {
   %c4 = arith.constant 4 : index
   %c8 = arith.constant 8 : index
@@ -134,6 +134,6 @@
     !stream.resource<constant>{%c8} = dense<[101, 102]> : tensor<2xi32>
     => !stream.timepoint
 
-  // CHECK: return %[[RES0]], %[[RES1]], %[[IF1]]#0
-  return %0#0, %0#1, %0#2 : !stream.resource<constant>, !stream.resource<constant>, !stream.timepoint
+  // CHECK: util.return %[[RES0]], %[[RES1]], %[[IF1]]#0
+  util.return %0#0, %0#1, %0#2 : !stream.resource<constant>, !stream.resource<constant>, !stream.timepoint
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/pack_dispatch_operands.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/pack_dispatch_operands.mlir
index 4f053e9..7977ed7 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/pack_dispatch_operands.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/pack_dispatch_operands.mlir

@@ -3,17 +3,17 @@
 stream.executable private @ex0 {
   stream.executable.export public @device_i1
   builtin.module {
-    // CHECK-LABEL: func.func @device_i1
+    // CHECK-LABEL:  util.func public @device_i1
     // CHECK-SAME: (%arg0: i32, %arg1: !stream.binding)
-    func.func @device_i1(%arg0: i1 {stream.values = [true, false]}, %arg1: !stream.binding) {
+     util.func public @device_i1(%arg0: i1 {stream.values = [true, false]}, %arg1: !stream.binding) {
       // CHECK-NEXT: %[[DEV_I1:.+]] = arith.trunci %arg0 {stream.values = [true, false]} : i32 to i1
       // CHECK-NEXT: util.optimization_barrier %[[DEV_I1]]
       util.optimization_barrier %arg0 : i1
-      return
+      util.return
     }
   }
 }
-func.func @host_i1(%arg0: i1) -> !stream.timepoint {
+util.func public @host_i1(%arg0: i1) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c128 = arith.constant 128 : index
@@ -25,7 +25,7 @@
       wo %arg1[%c0 for %c128] : !stream.resource<external>{%c128}
     }
   } => !stream.timepoint
-  return %1 : !stream.timepoint
+  util.return %1 : !stream.timepoint
 }
 
 // -----
@@ -33,18 +33,18 @@
 stream.executable private @ex1 {
   stream.executable.export public @device_bf16
   builtin.module {
-    // CHECK-LABEL: func.func @device_bf16
+    // CHECK-LABEL:  util.func public @device_bf16
     // CHECK-SAME: (%arg0: i32, %arg1: !stream.binding)
-    func.func @device_bf16(%arg0: bf16, %arg1: !stream.binding) {
+     util.func public @device_bf16(%arg0: bf16, %arg1: !stream.binding) {
       // CHECK-NEXT: %[[DEV_I16:.+]] = arith.trunci %arg0 : i32 to i16
       // CHECK-NEXT: %[[DEV_BF16:.+]] = arith.bitcast %[[DEV_I16]] : i16 to bf16
       // CHECK-NEXT: util.optimization_barrier %[[DEV_BF16]]
       util.optimization_barrier %arg0 : bf16
-      return
+      util.return
     }
   }
 }
-func.func @host_bf16(%arg0: bf16) -> !stream.timepoint {
+util.func public @host_bf16(%arg0: bf16) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c128 = arith.constant 128 : index
@@ -57,7 +57,7 @@
       wo %arg1[%c0 for %c128] : !stream.resource<external>{%c128}
     }
   } => !stream.timepoint
-  return %1 : !stream.timepoint
+  util.return %1 : !stream.timepoint
 }
 
 // -----
@@ -66,20 +66,20 @@
   // CHECK-LABEL: @device_i64
   stream.executable.export public @device_i64
   builtin.module {
-    // CHECK-LABEL: func.func @device_i64
+    // CHECK-LABEL:  util.func public @device_i64
     // CHECK-SAME: (%[[DEV_LO32:.+]]: i32, %[[DEV_HI32:.+]]: i32, %arg2: !stream.binding)
-    func.func @device_i64(%arg0: i64 {stream.values = [-1 : i64, 0x0000000200000003 : i64]}, %arg1: !stream.binding) {
+     util.func public @device_i64(%arg0: i64 {stream.values = [-1 : i64, 0x0000000200000003 : i64]}, %arg1: !stream.binding) {
       // CHECK-DAG: %[[DEV_LO64:.+]] = arith.extui %[[DEV_LO32]] : i32 to i64
       // CHECK-DAG: %[[DEV_HI64:.+]] = arith.extui %[[DEV_HI32]] : i32 to i64
       // CHECK-DAG: %[[DEV_HISHL:.+]] = arith.shli %[[DEV_HI64]], %c32
       // CHECK-DAG: %[[DEV_I64:.+]] = arith.ori %[[DEV_LO64]], %[[DEV_HISHL]] {stream.values = [-1, 8589934595]}
       // CHECK-NEXT: util.optimization_barrier %[[DEV_I64]]
       util.optimization_barrier %arg0 : i64
-      return
+      util.return
     }
   }
 }
-func.func @host_i64(%arg0: i64) -> !stream.timepoint {
+util.func public @host_i64(%arg0: i64) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c128 = arith.constant 128 : index
@@ -93,7 +93,7 @@
       wo %arg1[%c0 for %c128] : !stream.resource<external>{%c128}
     }
   } => !stream.timepoint
-  return %1 : !stream.timepoint
+  util.return %1 : !stream.timepoint
 }
 
 // -----
@@ -109,9 +109,9 @@
 stream.executable private @ex3 attributes {stream.resources = #resourceIndex32} {
   stream.executable.export public @device_index_32
   builtin.module {
-    // CHECK-LABEL: func.func @device_index_32
+    // CHECK-LABEL:  util.func public @device_index_32
     // CHECK-SAME: (%[[DEV_I32:.+]]: i32, %{{.+}}: !stream.binding)
-    func.func @device_index_32(%arg0: index {stream.alignment = 16 : index, stream.values = [0 : index, 1234 : index]}, %arg1: !stream.binding) {
+     util.func public @device_index_32(%arg0: index {stream.alignment = 16 : index, stream.values = [0 : index, 1234 : index]}, %arg1: !stream.binding) {
       // 32-bit device size fits in a push constant:
       // CHECK: %[[DEV_INDEX:.+]] = arith.index_castui %[[DEV_I32]] {
       // CHECK-SAME:   stream.alignment = 16 : index
@@ -119,11 +119,11 @@
       // CHECK-SAME: } : i32 to index
       // CHECK: util.optimization_barrier %[[DEV_INDEX]]
       util.optimization_barrier %arg0 : index
-      return
+      util.return
     }
   }
 }
-func.func @host_index_32(%arg0: index) -> !stream.timepoint {
+util.func public @host_index_32(%arg0: index) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c128 = arith.constant 128 : index
@@ -138,7 +138,7 @@
       wo %arg1[%c0 for %c128] : !stream.resource<external>{%c128}
     }
   } => !stream.timepoint
-  return %1 : !stream.timepoint
+  util.return %1 : !stream.timepoint
 }
 
 // -----
@@ -154,9 +154,9 @@
 stream.executable private @ex4 attributes {stream.resources = #resourceIndex64} {
   stream.executable.export public @device_index_64
   builtin.module {
-    // CHECK-LABEL: func.func @device_index_64
+    // CHECK-LABEL:  util.func public @device_index_64
     // CHECK-SAME: (%[[DEV_LO32:.+]]: i32, %[[DEV_HI32:.+]]: i32, %{{.+}}: !stream.binding)
-    func.func @device_index_64(%arg0: index {stream.alignment = 16 : index, stream.values = [0 : index, 1234 : index]}, %arg1: !stream.binding) {
+     util.func public @device_index_64(%arg0: index {stream.alignment = 16 : index, stream.values = [0 : index, 1234 : index]}, %arg1: !stream.binding) {
       // 64-bit device size requires joining after it was split into lo/hi:
       // CHECK-DAG: %[[DEV_LO64:.+]] = arith.extui %[[DEV_LO32]] : i32 to i64
       // CHECK-DAG: %[[DEV_HI64:.+]] = arith.extui %[[DEV_HI32]] : i32 to i64
@@ -168,11 +168,11 @@
       // CHECK-SAME: } : i64 to index
       // CHECK: util.optimization_barrier %[[DEV_INDEX]]
       util.optimization_barrier %arg0 : index
-      return
+      util.return
     }
   }
 }
-func.func @host_index_64(%arg0: index) -> !stream.timepoint {
+util.func public @host_index_64(%arg0: index) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c128 = arith.constant 128 : index
@@ -190,7 +190,7 @@
       wo %arg1[%c0 for %c128] : !stream.resource<external>{%c128}
     }
   } => !stream.timepoint
-  return %1 : !stream.timepoint
+  util.return %1 : !stream.timepoint
 }
 
 // -----
@@ -199,19 +199,19 @@
   // CHECK-LABEL: @device_complex_f32
   stream.executable.export public @device_complex_f32
   builtin.module {
-    // CHECK-LABEL: func.func @device_complex_f32
+    // CHECK-LABEL:  util.func public @device_complex_f32
     // CHECK-SAME: (%[[DEV_REAL_I32:.+]]: i32, %[[DEV_IMAG_I32:.+]]: i32, %arg2: !stream.binding)
-    func.func @device_complex_f32(%arg0: complex<f32>, %arg1: !stream.binding) {
+     util.func public @device_complex_f32(%arg0: complex<f32>, %arg1: !stream.binding) {
       // CHECK-DAG: %[[DEV_REAL_F32:.+]] = arith.bitcast %[[DEV_REAL_I32]] : i32 to f32
       // CHECK-DAG: %[[DEV_IMAG_F32:.+]] = arith.bitcast %[[DEV_IMAG_I32]] : i32 to f32
       // CHECK-DAG: %[[DEV_COMPLEX:.+]] = complex.create %[[DEV_REAL_F32]], %[[DEV_IMAG_F32]]
       // CHECK-NEXT: util.optimization_barrier %[[DEV_COMPLEX]]
       util.optimization_barrier %arg0 : complex<f32>
-      return
+      util.return
     }
   }
 }
-func.func @host_complex_f32(%arg0: complex<f32>) -> !stream.timepoint {
+util.func public @host_complex_f32(%arg0: complex<f32>) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c128 = arith.constant 128 : index
@@ -226,7 +226,7 @@
       wo %arg1[%c0 for %c128] : !stream.resource<external>{%c128}
     }
   } => !stream.timepoint
-  return %1 : !stream.timepoint
+  util.return %1 : !stream.timepoint
 }
 
 // -----
@@ -235,19 +235,19 @@
   // CHECK-LABEL: @device_complex_f64_bitcast
   stream.executable.export public @device_complex_f64_bitcast
   builtin.module {
-    // CHECK-LABEL: func.func @device_complex_f64
+    // CHECK-LABEL:  util.func public @device_complex_f64
     // CHECK-SAME: (%{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32, %arg4: !stream.binding)
-    func.func @device_complex_f64_bitcast(%arg0: complex<f64>, %arg1: !stream.binding) {
+     util.func public @device_complex_f64_bitcast(%arg0: complex<f64>, %arg1: !stream.binding) {
       // CHECK-COUNT-2: arith.bitcast {{.*}} : i64 to f64
       // CHECK: %[[DEV_COMPLEX:.+]] = complex.create
       // CHECK-NEXT: util.optimization_barrier %[[DEV_COMPLEX]]
       util.optimization_barrier %arg0 : complex<f64>
-      return
+      util.return
     }
   }
 }
-// CHECK-LABEL: func.func @host_complex_bitcast
-func.func @host_complex_bitcast(%arg0: complex<f64>) -> !stream.timepoint {
+// CHECK-LABEL:  util.func public @host_complex_bitcast
+util.func public @host_complex_bitcast(%arg0: complex<f64>) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c128 = arith.constant 128 : index
@@ -269,5 +269,5 @@
       wo %arg1[%c0 for %c128] : !stream.resource<external>{%c128}
     }
   } => !stream.timepoint
-  return %2 : !stream.timepoint
+  util.return %2 : !stream.timepoint
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/propagate_subviews.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/propagate_subviews.mlir
index 23934c4..3e7e384 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/propagate_subviews.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/propagate_subviews.mlir

@@ -13,7 +13,7 @@
 util.global private mutable @constantGlobal : !stream.resource<constant>
 
 // CHECK-LABEL: @globalLoad
-func.func private @globalLoad() {
+util.func private @globalLoad() {
   // CHECK-NEXT: %[[RESOURCE:.+]] = util.global.load @constantGlobal : !stream.resource<constant>
   // CHECK-NEXT: %[[STORAGE_SIZE:.+]] = util.global.load @constantGlobal__storage_size : index
   // CHECK-NEXT: %[[OFFSET:.+]] = util.global.load @constantGlobal__offset : index
@@ -22,7 +22,7 @@
   %0 = util.global.load @constantGlobal : !stream.resource<constant>
   // CHECK-NEXT: util.optimization_barrier %[[SUBVIEW]]
   util.optimization_barrier %0 : !stream.resource<constant>
-  return
+  util.return
 }
 
 // -----
@@ -39,13 +39,13 @@
 
 // CHECK-LABEL: @globalStore
 // CHECK-SAME: (%[[RESOURCE:.+]]: !stream.resource<variable>, %[[STORAGE_SIZE:.+]]: index, %[[OFFSET:.+]]: index, %[[LENGTH:.+]]: index)
-func.func private @globalStore(%resource: !stream.resource<variable>) {
+util.func private @globalStore(%resource: !stream.resource<variable>) {
   // CHECK: util.global.store %[[RESOURCE]], @mutableGlobal : !stream.resource<variable>
   // CHECK: util.global.store %[[STORAGE_SIZE]], @mutableGlobal__storage_size : index
   // CHECK: util.global.store %[[OFFSET]], @mutableGlobal__offset : index
   // CHECK: util.global.store %[[LENGTH]], @mutableGlobal__length : index
   util.global.store %resource, @mutableGlobal : !stream.resource<variable>
-  return
+  util.return
 }
 
 // -----
@@ -57,7 +57,7 @@
 
 // CHECK-LABEL: @funcArgs
 // CHECK-SAME: (%[[RESOURCE0:.+]]: !stream.resource<external>, %[[STORAGE_SIZE0:.+]]: index, %[[OFFSET0:.+]]: index, %[[LENGTH0:.+]]: index, %[[RESOURCE1:.+]]: !stream.resource<transient>, %[[STORAGE_SIZE1:.+]]: index, %[[OFFSET1:.+]]: index, %[[LENGTH1:.+]]: index)
-func.func private @funcArgs(%resource0: !stream.resource<external>, %resource1: !stream.resource<transient>) {
+util.func private @funcArgs(%resource0: !stream.resource<external>, %resource1: !stream.resource<transient>) {
   // CHECK-NEXT: %[[SUBVIEW0:.+]] = stream.resource.subview %[[RESOURCE0]][%[[OFFSET0]]] : !stream.resource<external>{%[[STORAGE_SIZE0]]} -> !stream.resource<external>{%[[LENGTH0]]}
   // CHECK-NEXT: %[[SUBVIEW1:.+]] = stream.resource.subview %[[RESOURCE1]][%[[OFFSET1]]] : !stream.resource<transient>{%[[STORAGE_SIZE1]]} -> !stream.resource<transient>{%[[LENGTH1]]}
 
@@ -65,7 +65,7 @@
   util.optimization_barrier %resource0 : !stream.resource<external>
   // CHECK-NEXT: util.optimization_barrier %[[SUBVIEW1]]
   util.optimization_barrier %resource1 : !stream.resource<transient>
-  return
+  util.return
 }
 
 // -----
@@ -78,13 +78,13 @@
 // CHECK-LABEL: @funcResults
 // CHECK-SAME: (%[[RESOURCE0:.+]]: !stream.resource<external>, %[[STORAGE_SIZE0:.+]]: index, %[[OFFSET0:.+]]: index, %[[LENGTH0:.+]]: index, %[[RESOURCE1:.+]]: !stream.resource<transient>, %[[STORAGE_SIZE1:.+]]: index, %[[OFFSET1:.+]]: index, %[[LENGTH1:.+]]: index)
 // CHECK-SAME: -> (!stream.resource<external>, index, index, index, !stream.resource<transient>, index, index, index)
-func.func private @funcResults(%resource0: !stream.resource<external>, %resource1: !stream.resource<transient>) -> (!stream.resource<external>, !stream.resource<transient>) {
+util.func private @funcResults(%resource0: !stream.resource<external>, %resource1: !stream.resource<transient>) -> (!stream.resource<external>, !stream.resource<transient>) {
   // NOTE: there will be extra stuff here from the arg insertion. Since the
   // return should consume the subview that was inserted we expect to directly
   // use the function arguments.
 
-  // CHECK: return %[[RESOURCE0]], %[[STORAGE_SIZE0]], %[[OFFSET0]], %[[LENGTH0]], %[[RESOURCE1]], %[[STORAGE_SIZE1]], %[[OFFSET1]], %[[LENGTH1]]
-  return %resource0, %resource1 : !stream.resource<external>, !stream.resource<transient>
+  // CHECK: util.return %[[RESOURCE0]], %[[STORAGE_SIZE0]], %[[OFFSET0]], %[[LENGTH0]], %[[RESOURCE1]], %[[STORAGE_SIZE1]], %[[OFFSET1]], %[[LENGTH1]]
+  util.return %resource0, %resource1 : !stream.resource<external>, !stream.resource<transient>
 }
 
 // -----
@@ -97,15 +97,15 @@
 
 // CHECK-LABEL: @caller
 // CHECK-SAME: (%[[RESOURCE0:.+]]: !stream.resource<external>, %[[STORAGE_SIZE0:.+]]: index, %[[OFFSET0:.+]]: index, %[[LENGTH0:.+]]: index, %[[RESOURCE1:.+]]: !stream.resource<transient>, %[[STORAGE_SIZE1:.+]]: index, %[[OFFSET1:.+]]: index, %[[LENGTH1:.+]]: index)
-func.func private @caller(%resource0: !stream.resource<external>, %resource1: !stream.resource<transient>) {
+util.func private @caller(%resource0: !stream.resource<external>, %resource1: !stream.resource<transient>) {
   // NOTE: there will be extra stuff here from the arg insertion. The call
   // consumes the subviews and we expect the args to be passed directly.
 
-  // CHECK: %[[RET:.+]]:8 = call @callee(%[[RESOURCE0]], %[[STORAGE_SIZE0]], %[[OFFSET0]], %[[LENGTH0]],
-  // CHECK-SAME:                         %[[RESOURCE1]], %[[STORAGE_SIZE1]], %[[OFFSET1]], %[[LENGTH1]])
+  // CHECK: %[[RET:.+]]:8 = util.call @callee(%[[RESOURCE0]], %[[STORAGE_SIZE0]], %[[OFFSET0]], %[[LENGTH0]],
+  // CHECK-SAME:                              %[[RESOURCE1]], %[[STORAGE_SIZE1]], %[[OFFSET1]], %[[LENGTH1]])
   // CHECK-SAME: : (!stream.resource<external>, index, index, index, !stream.resource<transient>, index, index, index)
   // CHECK-SAME: -> (!stream.resource<external>, index, index, index, !stream.resource<transient>, index, index, index)
-  %0:2 = call @callee(%resource0, %resource1) : (!stream.resource<external>, !stream.resource<transient>) -> (!stream.resource<external>, !stream.resource<transient>)
+  %0:2 = util.call @callee(%resource0, %resource1) : (!stream.resource<external>, !stream.resource<transient>) -> (!stream.resource<external>, !stream.resource<transient>)
   // CHECK-NEXT: %[[RET_SUBVIEW0:.+]] = stream.resource.subview %[[RET]]#0[%[[RET]]#2] : !stream.resource<external>{%[[RET]]#1} -> !stream.resource<external>{%[[RET]]#3}
   // CHECK-NEXT: %[[RET_SUBVIEW1:.+]] = stream.resource.subview %[[RET]]#4[%[[RET]]#6] : !stream.resource<transient>{%[[RET]]#5} -> !stream.resource<transient>{%[[RET]]#7}
 
@@ -114,11 +114,11 @@
   // CHECK-NEXT: util.optimization_barrier %[[RET_SUBVIEW1]] : !stream.resource<transient>
   util.optimization_barrier %0#1 : !stream.resource<transient>
 
-  return
+  util.return
 }
 
-func.func private @callee(%arg0: !stream.resource<external>, %arg1: !stream.resource<transient>) -> (!stream.resource<external>, !stream.resource<transient>) {
-  return %arg0, %arg1 : !stream.resource<external>, !stream.resource<transient>
+util.func private @callee(%arg0: !stream.resource<external>, %arg1: !stream.resource<transient>) -> (!stream.resource<external>, !stream.resource<transient>) {
+  util.return %arg0, %arg1 : !stream.resource<external>, !stream.resource<transient>
 }
 
 // -----
@@ -130,7 +130,7 @@
 
 // CHECK-LABEL: @br
 // CHECK-SAME: (%[[RESOURCE0:.+]]: !stream.resource<external>, %[[STORAGE_SIZE0:.+]]: index, %[[OFFSET0:.+]]: index, %[[LENGTH0:.+]]: index, %[[RESOURCE1:.+]]: !stream.resource<transient>, %[[STORAGE_SIZE1:.+]]: index, %[[OFFSET1:.+]]: index, %[[LENGTH1:.+]]: index)
-func.func private @br(%resource0: !stream.resource<external>, %resource1: !stream.resource<transient>) {
+util.func private @br(%resource0: !stream.resource<external>, %resource1: !stream.resource<transient>) {
   // NOTE: there will be extra stuff here from the arg insertion. The branch
   // consumes the unready resources and we expect the args to be passed directly
   // to the cf.br.
@@ -149,7 +149,7 @@
   // CHECK-NEXT: util.optimization_barrier %[[BB1_SUBVIEW1]]
   util.optimization_barrier %bb1_resource1 : !stream.resource<transient>
 
-  return
+  util.return
 }
 
 
@@ -159,7 +159,7 @@
 
 // CHECK-LABEL: @switch
 // CHECK-SAME: (%[[RESOURCE0:.+]]: !stream.resource<external>, %[[STORAGE_SIZE0:.+]]: index, %[[OFFSET0:.+]]: index, %[[LENGTH0:.+]]: index, %[[RESOURCE1:.+]]: !stream.resource<transient>, %[[STORAGE_SIZE1:.+]]: index, %[[OFFSET1:.+]]: index, %[[LENGTH1:.+]]: index)
-func.func private @switch(%resource0: !stream.resource<external>, %resource1: !stream.resource<transient>) {
+util.func private @switch(%resource0: !stream.resource<external>, %resource1: !stream.resource<transient>) {
   %flag = arith.constant 1 : i32
 
   // CHECK:      cf.switch
@@ -182,5 +182,5 @@
   // CHECK-NEXT: util.optimization_barrier %[[BB1_SUBVIEW1]]
   util.optimization_barrier %bb1_resource1 : !stream.resource<transient>
 
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/propagate_timepoints.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/propagate_timepoints.mlir
index a1fb5f1..28b64c4 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/propagate_timepoints.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/propagate_timepoints.mlir

@@ -10,7 +10,7 @@
 util.global private mutable @constantGlobal : !stream.resource<constant>
 
 // CHECK-LABEL: @globalLoad
-func.func @globalLoad() {
+util.func private @globalLoad() {
   // CHECK-NEXT: %[[TIMEPOINT:.+]] = util.global.load @constantGlobal__timepoint : !stream.timepoint
   // CHECK-NEXT: %[[UNREADY:.+]] = util.global.load @constantGlobal : !stream.resource<constant>
   // CHECK-NEXT: %[[SIZE:.+]] = stream.resource.size %[[UNREADY]]
@@ -18,7 +18,7 @@
   %0 = util.global.load @constantGlobal : !stream.resource<constant>
   // CHECK-NEXT: util.optimization_barrier %[[VALUE]]
   util.optimization_barrier %0 : !stream.resource<constant>
-  return
+  util.return
 }
 
 // -----
@@ -34,12 +34,12 @@
 util.global private mutable @mutableGlobal : !stream.resource<variable>
 
 // CHECK-LABEL: @globalStore
-// CHECK-SAME: (%[[TIMEPOINT:.+]]: !stream.timepoint, %[[UNREADY:.+]]: !stream.resource<variable>)
-func.func private @globalStore(%arg0: !stream.resource<variable>) {
+// CHECK-SAME: (%[[UNREADY:.+]]: !stream.resource<variable>, %[[TIMEPOINT:.+]]: !stream.timepoint)
+util.func private @globalStore(%arg0: !stream.resource<variable>) {
   //      CHECK: util.global.store %[[TIMEPOINT]], @mutableGlobal__timepoint : !stream.timepoint
   // CHECK-NEXT: util.global.store %[[UNREADY]], @mutableGlobal : !stream.resource<variable>
   util.global.store %arg0, @mutableGlobal : !stream.resource<variable>
-  return
+  util.return
 }
 
 // -----
@@ -50,9 +50,9 @@
 // This rotates waits from callers into callees.
 
 // CHECK-LABEL: @funcArgs
-// CHECK-SAME: (%[[TIMEPOINT0:.+]]: !stream.timepoint, %[[UNREADY0:.+]]: !stream.resource<external>,
-// CHECK-SAME:  %[[TIMEPOINT1:.+]]: !stream.timepoint, %[[UNREADY1:.+]]: !stream.resource<transient>)
-func.func private @funcArgs(%arg0: !stream.resource<external>, %arg1: !stream.resource<transient>) {
+// CHECK-SAME: (%[[UNREADY0:.+]]: !stream.resource<external>, %[[TIMEPOINT0:.+]]: !stream.timepoint,
+// CHECK-SAME:  %[[UNREADY1:.+]]: !stream.resource<transient>, %[[TIMEPOINT1:.+]]: !stream.timepoint)
+util.func private @funcArgs(%arg0: !stream.resource<external>, %arg1: !stream.resource<transient>) {
   // CHECK-NEXT: %[[SIZE0:.+]] = stream.resource.size %[[UNREADY0]] : !stream.resource<external>
   // CHECK-NEXT: %[[READY0:.+]] = stream.timepoint.await %[[TIMEPOINT0]] => %[[UNREADY0]] : !stream.resource<external>{%[[SIZE0]]}
   // CHECK-NEXT: %[[SIZE1:.+]] = stream.resource.size %[[UNREADY1]] : !stream.resource<transient>
@@ -62,7 +62,8 @@
   util.optimization_barrier %arg0 : !stream.resource<external>
   // CHECK-NEXT: util.optimization_barrier %[[READY1]]
   util.optimization_barrier %arg1 : !stream.resource<transient>
-  return
+
+  util.return
 }
 
 // -----
@@ -73,15 +74,15 @@
 // This rotates waits from callees into callers.
 
 // CHECK-LABEL: @funcResults
-// CHECK-SAME: (%[[TIMEPOINT0:.+]]: !stream.timepoint, %[[UNREADY0:.+]]: !stream.resource<external>,
-// CHECK-SAME:  %[[TIMEPOINT1:.+]]: !stream.timepoint, %[[UNREADY1:.+]]: !stream.resource<transient>)
-func.func private @funcResults(%arg0: !stream.resource<external>, %arg1: !stream.resource<transient>) -> (!stream.resource<external>, !stream.resource<transient>) {
+// CHECK-SAME: (%[[UNREADY0:.+]]: !stream.resource<external>, %[[TIMEPOINT0:.+]]: !stream.timepoint,
+// CHECK-SAME:  %[[UNREADY1:.+]]: !stream.resource<transient>, %[[TIMEPOINT1:.+]]: !stream.timepoint)
+util.func private @funcResults(%arg0: !stream.resource<external>, %arg1: !stream.resource<transient>) -> (!stream.resource<external>, !stream.resource<transient>) {
   // NOTE: there will be extra stuff here from the arg insertion. Since the
   // return should consume the await that was inserted we expect to directly use
   // the function arguments.
 
-  // CHECK: return %[[TIMEPOINT0]], %[[UNREADY0]], %[[TIMEPOINT1]], %[[UNREADY1]]
-  return %arg0, %arg1 : !stream.resource<external>, !stream.resource<transient>
+  // CHECK: util.return %[[UNREADY0]], %[[TIMEPOINT0]], %[[UNREADY1]], %[[TIMEPOINT1]]
+  util.return %arg0, %arg1 : !stream.resource<external>, !stream.resource<transient>
 }
 
 // -----
@@ -94,31 +95,31 @@
 // callees to callers.
 
 // CHECK-LABEL: @caller
-// CHECK-SAME: (%[[TIMEPOINT0:.+]]: !stream.timepoint, %[[UNREADY0:.+]]: !stream.resource<external>,
-// CHECK-SAME:  %[[TIMEPOINT1:.+]]: !stream.timepoint, %[[UNREADY1:.+]]: !stream.resource<transient>)
-func.func private @caller(%arg0: !stream.resource<external>, %arg1: !stream.resource<transient>) {
+// CHECK-SAME: (%[[UNREADY0:.+]]: !stream.resource<external>, %[[TIMEPOINT0:.+]]: !stream.timepoint,
+// CHECK-SAME:  %[[UNREADY1:.+]]: !stream.resource<transient>, %[[TIMEPOINT1:.+]]: !stream.timepoint)
+util.func private @caller(%arg0: !stream.resource<external>, %arg1: !stream.resource<transient>) {
   // NOTE: there will be extra stuff here from the arg insertion. The call
   // consumes the unready resources and we expect the args to be passed
   // directly.
 
-  // CHECK: %[[RET:.+]]:4 = call @callee(%[[TIMEPOINT0]], %[[UNREADY0]], %[[TIMEPOINT1]], %[[UNREADY1]])
-  // CHECK-SAME: : (!stream.timepoint, !stream.resource<external>, !stream.timepoint, !stream.resource<transient>) -> (!stream.timepoint, !stream.resource<external>, !stream.timepoint, !stream.resource<transient>)
-  %0:2 = call @callee(%arg0, %arg1) : (!stream.resource<external>, !stream.resource<transient>) -> (!stream.resource<external>, !stream.resource<transient>)
-  // CHECK-NEXT: %[[RET_SIZE0:.+]] = stream.resource.size %[[RET]]#1 : !stream.resource<external>
-  // CHECK-NEXT: %[[RET_READY0:.+]] = stream.timepoint.await %[[RET]]#0 => %[[RET]]#1 : !stream.resource<external>{%[[RET_SIZE0]]}
-  // CHECK-NEXT: %[[RET_SIZE1:.+]] = stream.resource.size %[[RET]]#3 : !stream.resource<transient>
-  // CHECK-NEXT: %[[RET_READY1:.+]] = stream.timepoint.await %[[RET]]#2 => %[[RET]]#3 : !stream.resource<transient>{%[[RET_SIZE1]]}
+  // CHECK: %[[RET:.+]]:4 = util.call @callee(%[[UNREADY0]], %[[TIMEPOINT0]], %[[UNREADY1]], %[[TIMEPOINT1]])
+  // CHECK-SAME: : (!stream.resource<external>, !stream.timepoint, !stream.resource<transient>, !stream.timepoint) -> (!stream.resource<external>, !stream.timepoint, !stream.resource<transient>, !stream.timepoint)
+  %0:2 = util.call @callee(%arg0, %arg1) : (!stream.resource<external>, !stream.resource<transient>) -> (!stream.resource<external>, !stream.resource<transient>)
+  // CHECK-NEXT: %[[RET_SIZE0:.+]] = stream.resource.size %[[RET]]#0 : !stream.resource<external>
+  // CHECK-NEXT: %[[RET_READY0:.+]] = stream.timepoint.await %[[RET]]#1 => %[[RET]]#0 : !stream.resource<external>{%[[RET_SIZE0]]}
+  // CHECK-NEXT: %[[RET_SIZE1:.+]] = stream.resource.size %[[RET]]#2 : !stream.resource<transient>
+  // CHECK-NEXT: %[[RET_READY1:.+]] = stream.timepoint.await %[[RET]]#3 => %[[RET]]#2 : !stream.resource<transient>{%[[RET_SIZE1]]}
 
   // CHECK-NEXT: util.optimization_barrier %[[RET_READY0]] : !stream.resource<external>
   util.optimization_barrier %0#0 : !stream.resource<external>
   // CHECK-NEXT: util.optimization_barrier %[[RET_READY1]] : !stream.resource<transient>
   util.optimization_barrier %0#1 : !stream.resource<transient>
 
-  return
+  util.return
 }
 
-func.func private @callee(%arg0: !stream.resource<external>, %arg1: !stream.resource<transient>) -> (!stream.resource<external>, !stream.resource<transient>) {
-  return %arg0, %arg1 : !stream.resource<external>, !stream.resource<transient>
+util.func private @callee(%arg0: !stream.resource<external>, %arg1: !stream.resource<transient>) -> (!stream.resource<external>, !stream.resource<transient>) {
+  util.return %arg0, %arg1 : !stream.resource<external>, !stream.resource<transient>
 }
 
 // -----
@@ -129,18 +130,18 @@
 // This rotates waits on branch operands into successors.
 
 // CHECK-LABEL: @br
-// CHECK-SAME: (%[[TIMEPOINT0:.+]]: !stream.timepoint, %[[UNREADY0:.+]]: !stream.resource<external>,
-// CHECK-SAME:  %[[TIMEPOINT1:.+]]: !stream.timepoint, %[[UNREADY1:.+]]: !stream.resource<transient>)
-func.func private @br(%arg0: !stream.resource<external>, %arg1: !stream.resource<transient>) {
+// CHECK-SAME: (%[[UNREADY0:.+]]: !stream.resource<external>, %[[TIMEPOINT0:.+]]: !stream.timepoint,
+// CHECK-SAME:  %[[UNREADY1:.+]]: !stream.resource<transient>, %[[TIMEPOINT1:.+]]: !stream.timepoint)
+util.func private @br(%arg0: !stream.resource<external>, %arg1: !stream.resource<transient>) {
   // NOTE: there will be extra stuff here from the arg insertion. The branch
   // consumes the unready resources and we expect the args to be passed directly
   // to the cf.br.
 
-  // CHECK: cf.br ^bb1(%[[TIMEPOINT0]], %[[UNREADY0]], %[[TIMEPOINT1]], %[[UNREADY1]]
+  // CHECK: cf.br ^bb1(%[[UNREADY0]], %[[TIMEPOINT0]], %[[UNREADY1]], %[[TIMEPOINT1]]
   cf.br ^bb1(%arg0, %arg1 : !stream.resource<external>, !stream.resource<transient>)
 
-// CHECK-NEXT: ^bb1(%[[BB1_TIMEPOINT0:.+]]: !stream.timepoint, %[[BB1_UNREADY0:.+]]: !stream.resource<external>,
-// CHECK-SAME:      %[[BB1_TIMEPOINT1:.+]]: !stream.timepoint, %[[BB1_UNREADY1:.+]]: !stream.resource<transient>):
+// CHECK-NEXT: ^bb1(%[[BB1_UNREADY0:.+]]: !stream.resource<external>, %[[BB1_TIMEPOINT0:.+]]: !stream.timepoint,
+// CHECK-SAME:      %[[BB1_UNREADY1:.+]]: !stream.resource<transient>, %[[BB1_TIMEPOINT1:.+]]: !stream.timepoint):
 ^bb1(%bb1_arg0: !stream.resource<external>, %bb1_arg1: !stream.resource<transient>):
   // CHECK-NEXT: %[[SIZE0:.+]] = stream.resource.size %[[BB1_UNREADY0]] : !stream.resource<external>
   // CHECK-NEXT: %[[READY0:.+]] = stream.timepoint.await %[[BB1_TIMEPOINT0]] => %[[BB1_UNREADY0]] : !stream.resource<external>{%8}
@@ -151,7 +152,7 @@
   util.optimization_barrier %bb1_arg0 : !stream.resource<external>
   // CHECK-NEXT: util.optimization_barrier %[[READY1]]
   util.optimization_barrier %bb1_arg1 : !stream.resource<transient>
-  return
+  util.return
 }
 
 // -----
@@ -159,21 +160,21 @@
 // Tests switch terminator expansion similar to a branch test above.
 
 // CHECK-LABEL: @switch
-// CHECK-SAME: (%[[TIMEPOINT0:.+]]: !stream.timepoint, %[[UNREADY0:.+]]: !stream.resource<external>,
-// CHECK-SAME:  %[[TIMEPOINT1:.+]]: !stream.timepoint, %[[UNREADY1:.+]]: !stream.resource<transient>)
-func.func private @switch(%arg0: !stream.resource<external>, %arg1: !stream.resource<transient>) {
+// CHECK-SAME: (%[[UNREADY0:.+]]: !stream.resource<external>, %[[TIMEPOINT0:.+]]: !stream.timepoint,
+// CHECK-SAME:  %[[UNREADY1:.+]]: !stream.resource<transient>, %[[TIMEPOINT1:.+]]: !stream.timepoint)
+util.func private @switch(%arg0: !stream.resource<external>, %arg1: !stream.resource<transient>) {
   %flag = arith.constant 1 : i32
 
   // CHECK:      cf.switch
-  // CHECK-NEXT: default: ^bb1(%[[TIMEPOINT0]], %[[UNREADY0]], %[[TIMEPOINT1]], %[[UNREADY1]]
-  // CHECK-NEXT: 0: ^bb1(%[[TIMEPOINT0]], %[[UNREADY0]], %[[TIMEPOINT1]], %[[UNREADY1]]
+  // CHECK-NEXT: default: ^bb1(%[[UNREADY0]], %[[TIMEPOINT0]], %[[UNREADY1]], %[[TIMEPOINT1]]
+  // CHECK-NEXT: 0: ^bb1(%[[UNREADY0]], %[[TIMEPOINT0]], %[[UNREADY1]], %[[TIMEPOINT1]]
   cf.switch %flag : i32, [
     default: ^bb1(%arg0, %arg1 : !stream.resource<external>, !stream.resource<transient>),
     0: ^bb1(%arg0, %arg1 : !stream.resource<external>, !stream.resource<transient>)
   ]
 
-// CHECK: ^bb1(%[[BB1_TIMEPOINT0:.+]]: !stream.timepoint, %[[BB1_UNREADY0:.+]]: !stream.resource<external>,
-// CHECK-SAME:      %[[BB1_TIMEPOINT1:.+]]: !stream.timepoint, %[[BB1_UNREADY1:.+]]: !stream.resource<transient>):
+//      CHECK: ^bb1(%[[BB1_UNREADY0:.+]]: !stream.resource<external>, %[[BB1_TIMEPOINT0:.+]]: !stream.timepoint,
+// CHECK-SAME:      %[[BB1_UNREADY1:.+]]: !stream.resource<transient>, %[[BB1_TIMEPOINT1:.+]]: !stream.timepoint):
 ^bb1(%bb1_arg0: !stream.resource<external>, %bb1_arg1: !stream.resource<transient>):
   // CHECK-NEXT: %[[SIZE0:.+]] = stream.resource.size %[[BB1_UNREADY0]] : !stream.resource<external>
   // CHECK-NEXT: %[[READY0:.+]] = stream.timepoint.await %[[BB1_TIMEPOINT0]] => %[[BB1_UNREADY0]] : !stream.resource<external>{%8}
@@ -184,7 +185,7 @@
   util.optimization_barrier %bb1_arg0 : !stream.resource<external>
   // CHECK-NEXT: util.optimization_barrier %[[READY1]]
   util.optimization_barrier %bb1_arg1 : !stream.resource<transient>
-  return
+  util.return
 }
 
 // -----
@@ -196,9 +197,9 @@
 // This rotates waits on producers to waits on consumers.
 
 // CHECK-LABEL: @asyncExecuteConsume
-// CHECK-SAME: (%[[TIMEPOINT0:.+]]: !stream.timepoint, %[[UNREADY0:.+]]: !stream.resource<external>,
-// CHECK-SAME:  %[[TIMEPOINT1:.+]]: !stream.timepoint, %[[UNREADY1:.+]]: !stream.resource<transient>)
-func.func private @asyncExecuteConsume(%arg0: !stream.resource<external>, %arg1: !stream.resource<transient>) {
+// CHECK-SAME: (%[[UNREADY0:.+]]: !stream.resource<external>, %[[TIMEPOINT0:.+]]: !stream.timepoint,
+// CHECK-SAME:  %[[UNREADY1:.+]]: !stream.resource<transient>, %[[TIMEPOINT1:.+]]: !stream.timepoint)
+util.func private @asyncExecuteConsume(%arg0: !stream.resource<external>, %arg1: !stream.resource<transient>) {
   // NOTE: there will be extra stuff here from the arg insertion. The execution
   // region consumes the unready resources and we expect the args to be captured
   // directly.
@@ -219,5 +220,6 @@
   %ready_results:2 = stream.timepoint.await %results_timepoint => %results#0, %results#1 : !stream.resource<external>{%arg0_size}, !stream.resource<transient>{%arg1_size}
   util.optimization_barrier %ready_results#0 : !stream.resource<external>
   util.optimization_barrier %ready_results#1 : !stream.resource<transient>
-  return
+
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/refine_usage.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/refine_usage.mlir
index 7276441..1815518 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/refine_usage.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/refine_usage.mlir

@@ -7,25 +7,25 @@
 
 // CHECK-LABEL: @propagateFuncCallee
 // CHECK-SAME: (%[[ARG:.+]]: !stream.resource<external>, %[[SIZE:.+]]: index) -> !stream.resource<external>
-func.func private @propagateFuncCallee(%arg: !stream.resource<*>, %size: index) -> !stream.resource<*> {
+util.func private @propagateFuncCallee(%arg: !stream.resource<*>, %size: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   %c123_i32 = arith.constant 123 : i32
   // CHECK: stream.async.fill {{.+}} !stream.resource<external>
   %fill = stream.async.fill %c123_i32, %arg[%c0 to %c128 for %c128] : i32 -> %0 as !stream.resource<*>{%size}
-  // CHECK: return {{.+}} : !stream.resource<external>
-  return %fill : !stream.resource<*>
+  // CHECK: util.return {{.+}} : !stream.resource<external>
+  util.return %fill : !stream.resource<*>
 }
 // CHECK: @propagateFuncCaller
 // CHECK-SAME: -> !stream.resource<external>
-func.func @propagateFuncCaller(%size: index) -> !stream.resource<*> {
+util.func public @propagateFuncCaller(%size: index) -> !stream.resource<*> {
   %c123_i32 = arith.constant 123 : i32
   // CHECK: stream.async.splat {{.+}} -> !stream.resource<external>
   %splat = stream.async.splat %c123_i32 : i32 -> !stream.resource<*>{%size}
   // CHECK: call @propagateFuncCallee({{.+}}) : (!stream.resource<external>, index) -> !stream.resource<external>
-  %result = call @propagateFuncCallee(%splat, %size) : (!stream.resource<*>, index) -> !stream.resource<*>
-  // CHECK: return {{.+}} : !stream.resource<external>
-  return %result : !stream.resource<*>
+  %result = util.call @propagateFuncCallee(%splat, %size) : (!stream.resource<*>, index) -> !stream.resource<*>
+  // CHECK: util.return {{.+}} : !stream.resource<external>
+  util.return %result : !stream.resource<*>
 }
 
 // -----
@@ -34,7 +34,7 @@
 // and the type changes we don't explode.
 
 // CHECK-LABEL: @transitionTypesAcrossTies
-func.func @transitionTypesAcrossTies() -> !hal.buffer_view {
+util.func public @transitionTypesAcrossTies() -> !hal.buffer_view {
   %c4 = arith.constant 4 : index
   %c255_i32 = arith.constant 255 : i32
   // CHECK: %[[SPLAT:.+]] = stream.async.splat {{.+}} -> !stream.resource<external>
@@ -43,7 +43,7 @@
   %1 = stream.async.transfer %0 : !stream.resource<*>{%c4} -> !stream.resource<external>{%c4}
   // CHECK: stream.tensor.export %[[SPLAT]] : tensor<f32> in !stream.resource<external>{%c4} -> !hal.buffer_view
   %2 = stream.tensor.export %1 : tensor<f32> in !stream.resource<external>{%c4} -> !hal.buffer_view
-  return %2 : !hal.buffer_view
+  util.return %2 : !hal.buffer_view
 }
 
 // -----
@@ -58,7 +58,7 @@
 
 // CHECK-LABEL: @propagateBlocks
 // CHECK-SAME: (%[[COND:.+]]: i1, {{.+}}) -> (!stream.resource<transient>, !stream.resource<external>)
-func.func private @propagateBlocks(%cond: i1, %size: index) -> (!stream.resource<*>, !stream.resource<external>) {
+util.func private @propagateBlocks(%cond: i1, %size: index) -> (!stream.resource<*>, !stream.resource<external>) {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   %c123_i32 = arith.constant 123 : i32
@@ -89,8 +89,8 @@
 ^bb2(%bb2_0: !stream.resource<*>, %bb2_1: !stream.resource<*>):
   // CHECK-NOT: stream.async.transfer
   %external_transfer = stream.async.transfer %bb2_1 : !stream.resource<*>{%size} -> !stream.resource<external>{%size}
-  // CHECK: return %[[BB2_ARG0]], %[[BB2_ARG1]] : !stream.resource<transient>, !stream.resource<external>
-  return %bb2_0, %external_transfer : !stream.resource<*>, !stream.resource<external>
+  // CHECK: util.return %[[BB2_ARG0]], %[[BB2_ARG1]] : !stream.resource<transient>, !stream.resource<external>
+  util.return %bb2_0, %external_transfer : !stream.resource<*>, !stream.resource<external>
 }
 
 // -----
@@ -101,15 +101,15 @@
 // CHECK-LABEL: @conflictResolution
 // CHECK-SAME: (%[[COND:.+]]: i1, %[[ARG0:.+]]: !stream.resource<transient>, %[[ARG1:.+]]: !stream.resource<external>, %[[SIZE:.+]]: index)
 // CHECK-SAME: -> !stream.resource<external>
-func.func @conflictResolution(%cond: i1, %arg0: !stream.resource<transient>, %arg1: !stream.resource<external>, %size: index) -> !stream.resource<*> {
+util.func public @conflictResolution(%cond: i1, %arg0: !stream.resource<transient>, %arg1: !stream.resource<external>, %size: index) -> !stream.resource<*> {
   // CHECK: %[[ARG0_EXT:.+]] = stream.async.transfer %[[ARG0]]
   %arg0_any = stream.async.transfer %arg0 : !stream.resource<transient>{%size} -> !stream.resource<*>{%size}
   // CHECK-NOT: stream.async.transfer %[[ARG1]]
   %arg1_any = stream.async.transfer %arg1 : !stream.resource<external>{%size} -> !stream.resource<*>{%size}
   // CHECK: %[[RET:.+]] = arith.select %[[COND]], %[[ARG0_EXT]], %[[ARG1]] : !stream.resource<external>
   %0 = arith.select %cond, %arg0_any, %arg1_any : !stream.resource<*>
-  // CHECK: return %[[RET]] : !stream.resource<external>
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %[[RET]] : !stream.resource<external>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
@@ -122,7 +122,7 @@
 // CHECK-LABEL: @transferResolution
 // CHECK-SAME: (%[[ARG0:.+]]: !stream.resource<constant>, %[[SIZE:.+]]: index)
 // CHECK-SAME: -> !stream.resource<external>
-func.func @transferResolution(%arg0: !stream.resource<constant>, %size: index) -> !stream.resource<*> {
+util.func public @transferResolution(%arg0: !stream.resource<constant>, %size: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   // CHECK: %[[ARG0_EXT:.+]] = stream.async.transfer %[[ARG0]] : !stream.resource<constant>{%[[SIZE]]} -> !stream.resource<external>{%[[SIZE]]}
@@ -130,7 +130,7 @@
   // CHECK: %[[RET0:.+]] = stream.async.dispatch @ex::@dispatch[%c1, %c1, %c1](%[[ARG0_EXT]][%c0 to %[[SIZE]] for %[[SIZE]]]) : (!stream.resource<external>{%[[SIZE]]}) -> %[[ARG0_EXT]]{%[[SIZE]]}
   %ret0_any = stream.async.dispatch @ex::@dispatch[%c1, %c1, %c1](%arg0_any[%c0 to %size for %size]) : (!stream.resource<*>{%size}) -> %arg0_any{%size}
   // return %[[RET0]] : !stream.resource<external>
-  return %ret0_any : !stream.resource<*>
+  util.return %ret0_any : !stream.resource<*>
 }
 
 // -----
@@ -139,14 +139,14 @@
 
 // CHECK-LABEL: @transferElision
 // CHECK-SAME: (%[[SIZE:.+]]: index) -> !stream.resource<external>
-func.func @transferElision(%size: index) -> !stream.resource<external> {
+util.func public @transferElision(%size: index) -> !stream.resource<external> {
   // CHECK: %[[ALLOCA:.+]] = stream.async.alloca
   %alloca = stream.async.alloca : !stream.resource<constant>{%size}
   %transfer_any = stream.async.transfer %alloca : !stream.resource<constant>{%size} -> !stream.resource<*>{%size}
   // CHECK: %[[TRANSFER_EXTERNAL:.+]] = stream.async.transfer %[[ALLOCA]] : !stream.resource<constant>{%[[SIZE]]} -> !stream.resource<external>{%[[SIZE]]}
   %transfer_external = stream.async.transfer %transfer_any : !stream.resource<*>{%size} -> !stream.resource<external>{%size}
-  // CHECK: return %[[TRANSFER_EXTERNAL]]
-  return %transfer_external : !stream.resource<external>
+  // CHECK: util.return %[[TRANSFER_EXTERNAL]]
+  util.return %transfer_external : !stream.resource<external>
 }
 
 // -----
@@ -158,25 +158,25 @@
 
 // CHECK-LABEL: @globalLoad()
 // CHECK-SAME: -> !stream.resource<variable>
-func.func private @globalLoad() -> !stream.resource<*> {
+util.func private @globalLoad() -> !stream.resource<*> {
   // CHECK: %[[VALUE:.+]] = util.global.load @variable : !stream.resource<variable>
   %value = util.global.load @variable : !stream.resource<variable>
   %size = util.global.load @variable__size : index
   // CHECK-NOT: stream.async.transfer
   %0 = stream.async.transfer %value : !stream.resource<variable>{%size} -> !stream.resource<*>{%size}
-  // CHECK: return %[[VALUE]]
-  return %0 : !stream.resource<*>
+  // CHECK: util.return %[[VALUE]]
+  util.return %0 : !stream.resource<*>
 }
 
 // CHECK-LABEL: @globalStore
 // CHECK-SAME: (%[[VALUE:.+]]: !stream.resource<variable>, %[[SIZE:.+]]: index)
-func.func private @globalStore(%value: !stream.resource<*>, %size: index) {
+util.func private @globalStore(%value: !stream.resource<*>, %size: index) {
   // CHECK-NOT: stream.async.transfer
   %0 = stream.async.transfer %value : !stream.resource<*>{%size} -> !stream.resource<variable>{%size}
   // CHECK: util.global.store %[[VALUE]], @variable : !stream.resource<variable>
   util.global.store %0, @variable : !stream.resource<variable>
   util.global.store %size, @variable__size : index
-  return
+  util.return
 }
 
 // -----
@@ -184,7 +184,7 @@
 // Tests that explicit resource allocations are refined.
 
 // CHECK-LABEL: @explicitAlloc
-func.func @explicitAlloc() -> !hal.buffer_view {
+util.func public @explicitAlloc() -> !hal.buffer_view {
   %c0 = arith.constant 0 : index
   // CHECK: %[[ALLOC:.+]] = stream.resource.alloc : !stream.resource<external>{%c0}
   %0 = stream.resource.alloc : !stream.resource<*>{%c0}
@@ -192,7 +192,7 @@
   %1 = stream.async.transfer %0 : !stream.resource<*>{%c0} -> !stream.resource<external>{%c0}
   // CHECK: stream.tensor.export %[[ALLOC]] : tensor<f32> in !stream.resource<external>{%c0} -> !hal.buffer_view
   %2 = stream.tensor.export %1 : tensor<f32> in !stream.resource<external>{%c0} -> !hal.buffer_view
-  return %2 : !hal.buffer_view
+  util.return %2 : !hal.buffer_view
 }
 
 // -----
@@ -200,7 +200,7 @@
 // Tests that async allocations that escape are turned into non-transient allocs.
 
 // CHECK-LABEL: @escapingAlloca
-func.func @escapingAlloca() -> !hal.buffer_view {
+util.func public @escapingAlloca() -> !hal.buffer_view {
   %c123 = arith.constant 123 : index
   // CHECK: %[[ALLOCA:.+]] = stream.async.alloca : !stream.resource<external>{%c123}
   %0 = stream.async.alloca : !stream.resource<*>{%c123}
@@ -208,13 +208,13 @@
   %1 = stream.async.transfer %0 : !stream.resource<*>{%c123} -> !stream.resource<external>{%c123}
   // CHECK: stream.tensor.export %[[ALLOCA]] : tensor<f32> in !stream.resource<external>{%c123} -> !hal.buffer_view
   %2 = stream.tensor.export %1 : tensor<f32> in !stream.resource<external>{%c123} -> !hal.buffer_view
-  return %2 : !hal.buffer_view
+  util.return %2 : !hal.buffer_view
 }
 
 // -----
 
 // CHECK-LABEL: @testIf
-func.func @testIf(%arg0: i1, %arg1: !stream.resource<*>, %arg2: !stream.resource<*>) -> !stream.resource<*> {
+util.func public @testIf(%arg0: i1, %arg1: !stream.resource<*>, %arg2: !stream.resource<*>) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   %c4 = arith.constant 4 : index
   // CHECK: %[[IF:.+]] = scf.if
@@ -233,13 +233,13 @@
     // CHECK-SAME: !stream.resource<external>
     scf.yield %arg1 : !stream.resource<*>
   }
-  return %if : !stream.resource<*>
+  util.return %if : !stream.resource<*>
 }
 
 // -----
 
 // CHECK: @testWhile
-func.func @testWhile(%arg0: i32, %arg1: !stream.resource<*>) -> (i32, !stream.resource<*>) {
+util.func public @testWhile(%arg0: i32, %arg1: !stream.resource<*>) -> (i32, !stream.resource<*>) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : i32
   %c4 = arith.constant 4 : index
@@ -260,8 +260,8 @@
     // CHECK-SAME: !stream.resource<external>
     scf.yield %add, %disp : i32, !stream.resource<*>
   }
-  // CHECK: return %[[IF]]#0, %[[IF]]#1 : i32, !stream.resource<external>
-  return %while#0, %while#1 : i32, !stream.resource<*>
+  // CHECK: util.return %[[IF]]#0, %[[IF]]#1 : i32, !stream.resource<external>
+  util.return %while#0, %while#1 : i32, !stream.resource<*>
 }
 
 // -----
@@ -269,7 +269,7 @@
 // CHECK-LABEL: @testWhileRecurse
 // CHECK-SAME: %[[ARG0:.+]]: !stream.resource<external>
 // CHECK-SAME: -> !stream.resource<external>
-func.func @testWhileRecurse(%arg0 : !stream.resource<*>) -> !stream.resource<external> {
+util.func public @testWhileRecurse(%arg0 : !stream.resource<*>) -> !stream.resource<external> {
   // CHECK-DAG: %[[C0:.+]] = arith.constant 0
   // CHECK-DAG: %[[C1:.+]] = arith.constant 1
   // CHECK-DAG: %[[C4:.+]] = arith.constant 4
@@ -304,8 +304,8 @@
   }
   %transfer = stream.async.transfer %while#0 : !stream.resource<*>{%while#1} -> !stream.resource<external>{%while#1}
 
-  // CHECK: return %[[WHILE]]#0
-  return %transfer : !stream.resource<external>
+  // CHECK: util.return %[[WHILE]]#0
+  util.return %transfer : !stream.resource<external>
 }
 
 // -----
@@ -313,7 +313,7 @@
 // CHECK-LABEL: @testForOp
 // CHECK-SAME: %[[ARG0:.+]]: index
 // CHECK-SAME: %[[ARG1:.+]]: !stream.resource<external>
-func.func @testForOp(%arg0 : index, %arg1 : !stream.resource<*>) -> !stream.resource<external> {
+util.func public @testForOp(%arg0 : index, %arg1 : !stream.resource<*>) -> !stream.resource<external> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c4 = arith.constant 4 : index
@@ -340,6 +340,6 @@
   %dispatch5 = stream.async.dispatch @dispatch4(%for[%c0 to %arg0 for %arg0]) : (!stream.resource<*>{%c4}) -> !stream.resource<*>{%c4}
   %transfer = stream.async.transfer %dispatch5 : !stream.resource<*>{%arg0} -> !stream.resource<external>{%arg0}
 
-  // CHECK: return %[[DISP4]] : !stream.resource<external>
-  return %transfer : !stream.resource<external>
+  // CHECK: util.return %[[DISP4]] : !stream.resource<external>
+  util.return %transfer : !stream.resource<external>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_allocation.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_allocation.mlir
index 39319b0..00f5c32 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_allocation.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_allocation.mlir

@@ -8,7 +8,7 @@
 // CHECK-SAME: (%[[OPERAND_TIMEPOINT:.+]]: !stream.timepoint,
 // CHECK-SAME:  %[[OPERAND:.+]]: !stream.resource<transient>,
 // CHECK-SAME   %[[SIZE:.+]]: index)
-func.func @extractConstants(%timepoint: !stream.timepoint, %operand: !stream.resource<transient>, %size: index) {
+util.func public @extractConstants(%timepoint: !stream.timepoint, %operand: !stream.resource<transient>, %size: index) {
   %c0 = arith.constant 0 : index
   %c8 = arith.constant 8 : index
   %c16 = arith.constant 16 : index
@@ -52,7 +52,7 @@
   util.optimization_barrier %results#2 : !stream.resource<variable>
   // CHECK: util.optimization_barrier %[[OPERAND]]
   util.optimization_barrier %results#3 : !stream.resource<transient>
-  return
+  util.return
 }
 
 // -----
@@ -61,7 +61,7 @@
 
 // CHECK-LABEL: @explicitAllocs
 // CHECK-SAME: (%[[SIZE:.+]]: index)
-func.func @explicitAllocs(%size: index) {
+util.func public @explicitAllocs(%size: index) {
   // CHECK: %[[ALLOC:.+]] = stream.resource.alloc : !stream.resource<external>{%[[SIZE]]}
   %alloc = stream.resource.alloc : !stream.resource<external>{%size}
   // CHECK: util.optimization_barrier %[[ALLOC]]
@@ -72,7 +72,7 @@
   %empty = stream.resource.alloc : !stream.resource<transient>{%c0}
   // CHECK: util.optimization_barrier %[[EMPTY]]
   util.optimization_barrier %empty : !stream.resource<transient>
-  return
+  util.return
 }
 
 // -----
@@ -82,7 +82,7 @@
 
 // CHECK-LABEL: @passthroughOperands
 // CHECK-SAME: (%[[OPERAND:.+]]: !stream.resource<transient>, %[[SIZE:.+]]: index)
-func.func @passthroughOperands(%operand: !stream.resource<transient>, %size: index) {
+util.func public @passthroughOperands(%operand: !stream.resource<transient>, %size: index) {
   // CHECK: = stream.cmd.execute with(%[[OPERAND]] as %[[CAPTURE:.+]]: !stream.resource<transient>{%[[SIZE]]})
   %result, %result_timepoint = stream.async.execute with(%operand as %capture: !stream.resource<transient>{%size}) -> (%operand as !stream.resource<transient>{%size}) {
     stream.yield %capture : !stream.resource<transient>{%size}
@@ -90,14 +90,14 @@
   } => !stream.timepoint
   // CHECK: util.optimization_barrier %[[OPERAND]]
   util.optimization_barrier %result : !stream.resource<transient>
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @capturedOperands
 // CHECK-SAME: (%[[OPERAND:.+]]: !stream.resource<transient>, %[[SIZE:.+]]: index)
-func.func @capturedOperands(%operand: !stream.resource<transient>, %size: index) {
+util.func public @capturedOperands(%operand: !stream.resource<transient>, %size: index) {
   // CHECK: stream.cmd.execute
   // CHECK-SAME: => with(%[[OPERAND]] as %[[CAPTURE:.+]]: !stream.resource<transient>{%[[SIZE]]}
   %result_timepoint = stream.async.execute with(%operand as %capture: !stream.resource<transient>{%size}) {
@@ -105,7 +105,7 @@
     %0 = stream.async.clone %capture : !stream.resource<transient>{%size} -> !stream.resource<transient>{%size}
     stream.yield
   } => !stream.timepoint
-  return
+  util.return
 }
 
 // -----
@@ -114,7 +114,7 @@
 
 // CHECK-LABEL: @tiedOperands
 // CHECK-SAME: (%[[OPERAND:.+]]: !stream.resource<transient>, %[[SIZE:.+]]: index)
-func.func @tiedOperands(%operand: !stream.resource<transient>, %size: index) {
+util.func public @tiedOperands(%operand: !stream.resource<transient>, %size: index) {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   %c255_i32 = arith.constant 255 : i32
@@ -126,7 +126,7 @@
   } => !stream.timepoint
   // CHECK: util.optimization_barrier %[[OPERAND]]
   util.optimization_barrier %result : !stream.resource<transient>
-  return
+  util.return
 }
 
 // -----
@@ -136,7 +136,7 @@
 // CHECK-LABEL: @tiedOperandSubviews
 // CHECK-SAME: (%[[OPERAND:.+]]: !stream.resource<external>,
 // CHECK-SAME:  %[[SIZE:.+]]: index, %[[OFFSET0:.+]]: index, %[[OFFSET1:.+]]: index, %[[OFFSET2:.+]]: index, %[[LENGTH0:.+]]: index, %[[LENGTH1:.+]]: index, %[[LENGTH2:.+]]: index)
-func.func @tiedOperandSubviews(%operand: !stream.resource<external>, %size: index, %offset0: index, %offset1: index, %offset2: index, %length0: index, %length1: index, %length2: index) {
+util.func public @tiedOperandSubviews(%operand: !stream.resource<external>, %size: index, %offset0: index, %offset1: index, %offset2: index, %length0: index, %length1: index, %length2: index) {
   %c0 = arith.constant 0 : index
   // CHECK: %[[SUBVIEW_OFFSET:.+]] = arith.addi %[[OFFSET0]], %[[OFFSET1]]
   // CHECK: %[[SUBVIEW:.+]] = stream.resource.subview %[[OPERAND]][%[[SUBVIEW_OFFSET]]] {{.*}} -> !stream.resource<external>{%[[LENGTH1]]}
@@ -156,7 +156,7 @@
   } => !stream.timepoint
   // CHECK: util.optimization_barrier %[[SUBVIEW]]
   util.optimization_barrier %result1 : !stream.resource<external>
-  return
+  util.return
 }
 
 // -----
@@ -166,7 +166,7 @@
 // CHECK-LABEL: @aliasPropagation
 // CHECK-SAME: (%[[OPERAND:.+]]: !stream.resource<external>,
 // CHECK-SAME:  %[[SIZE:.+]]: index, %[[OFFSET:.+]]: index, %[[LENGTH:.+]]: index)
-func.func @aliasPropagation(%operand: !stream.resource<external>, %size: index, %offset: index, %length: index) {
+util.func public @aliasPropagation(%operand: !stream.resource<external>, %size: index, %offset: index, %length: index) {
   %c0 = arith.constant 0 : index
   // CHECK: stream.cmd.execute with(%[[OPERAND]] as %[[CAPTURE:.+]]: !stream.resource<external>{%[[SIZE]]})
   %result, %result_timepoint = stream.async.execute with(%operand as %capture: !stream.resource<external>{%size}) -> (%operand as !stream.resource<external>{%size}) {
@@ -178,7 +178,7 @@
   } => !stream.timepoint
   // CHECK: util.optimization_barrier %[[OPERAND]]
   util.optimization_barrier %result : !stream.resource<external>
-  return
+  util.return
 }
 
 // -----
@@ -188,7 +188,7 @@
 
 // CHECK-LABEL: @producedResults
 // CHECK-SAME: (%[[SIZE0:.+]]: index, %[[SIZE1:.+]]: index)
-func.func @producedResults(%size0: index, %size1: index) {
+util.func public @producedResults(%size0: index, %size1: index) {
   %c254_i32 = arith.constant 254 : i32
   %c255_i32 = arith.constant 255 : i32
   //      CHECK: %[[PACK:.+]]:3 = stream.resource.pack slices({
@@ -214,7 +214,7 @@
   util.optimization_barrier %results#0 : !stream.resource<transient>
   // CHECK: util.optimization_barrier %[[SUBALLOCA1]]
   util.optimization_barrier %results#1 : !stream.resource<transient>
-  return
+  util.return
 }
 
 // -----
@@ -225,7 +225,7 @@
 
 // CHECK-LABEL: @locals
 // CHECK-SAME: (%[[SIZE0:.+]]: index, %[[SIZE1:.+]]: index, %[[AWAIT_TIMEPOINT:.+]]: !stream.timepoint)
-func.func @locals(%size0: index, %size1: index, %await_timepoint: !stream.timepoint) -> !stream.timepoint {
+util.func public @locals(%size0: index, %size1: index, %await_timepoint: !stream.timepoint) -> !stream.timepoint {
   %c254_i32 = arith.constant 254 : i32
   %c255_i32 = arith.constant 255 : i32
   //      CHECK: %[[SLICES:.+]]:3 = stream.resource.pack on(#hal.affinity.queue<[0]>) slices({
@@ -245,8 +245,8 @@
   } => !stream.timepoint
   // CHECK: %[[DEALLOCA_TIMEPOINT:.+]] = stream.resource.dealloca on(#hal.affinity.queue<[0]>) await(%[[EXEC_TIMEPOINT]]) => %[[ALLOCA]] : !stream.resource<transient>{%[[SLICES]]#0} => !stream.timepoint
   // CHECK: %[[JOIN:.+]] = stream.timepoint.join max(%[[DEALLOCA_TIMEPOINT]], %[[EXEC_TIMEPOINT]]) => !stream.timepoint
-  // CHECK: return %[[JOIN]]
-  return %result_timepoint : !stream.timepoint
+  // CHECK: util.return %[[JOIN]]
+  util.return %result_timepoint : !stream.timepoint
 }
 
 // -----
@@ -257,7 +257,7 @@
 
 // CHECK-LABEL: @concurrentRegions
 // CHECK-SAME: (%[[OPERAND:.+]]: !stream.resource<transient>, %[[SIZE:.+]]: index)
-func.func @concurrentRegions(%operand: !stream.resource<transient>, %size: index) {
+util.func public @concurrentRegions(%operand: !stream.resource<transient>, %size: index) {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   %c254_i32 = arith.constant 254 : i32
@@ -281,14 +281,14 @@
   util.optimization_barrier %results#0 : !stream.resource<transient>
   // CHECK: util.optimization_barrier %[[ALLOCA]]
   util.optimization_barrier %results#1 : !stream.resource<transient>
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @applyAsyncSplatOp
 // CHECK-SAME: (%[[SIZE:.+]]: index)
-func.func @applyAsyncSplatOp(%size: index) {
+util.func public @applyAsyncSplatOp(%size: index) {
   %c255_i32 = arith.constant 255 : i32
   // CHECK: %[[ALLOCA:.+]], %[[ALLOCA_TIMEPOINT:.+]] = stream.resource.alloca uninitialized : !stream.resource<transient>{%[[SIZE]]}
   // CHECK: stream.cmd.execute await(%[[ALLOCA_TIMEPOINT]])
@@ -300,14 +300,14 @@
   } => !stream.timepoint
   // CHECK: util.optimization_barrier %[[ALLOCA]]
   util.optimization_barrier %result : !stream.resource<transient>
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @applyAsyncCloneOp
 // CHECK-SAME: (%[[OPERAND:.+]]: !stream.resource<transient>, %[[SIZE:.+]]: index)
-func.func @applyAsyncCloneOp(%operand: !stream.resource<transient>, %size: index) {
+util.func public @applyAsyncCloneOp(%operand: !stream.resource<transient>, %size: index) {
   // CHECK: %[[ALLOCA:.+]], %[[ALLOCA_TIMEPOINT:.+]] = stream.resource.alloca uninitialized : !stream.resource<transient>{%[[SIZE]]}
   // CHECK: stream.cmd.execute await(%[[ALLOCA_TIMEPOINT]])
   // CHECK-SAME: with(%[[OPERAND]] as %[[OPERAND_CAPTURE:.+]]: !stream.resource<transient>{%[[SIZE]]},
@@ -320,7 +320,7 @@
   } => !stream.timepoint
   // CHECK: util.optimization_barrier %[[ALLOCA]]
   util.optimization_barrier %result : !stream.resource<transient>
-  return
+  util.return
 }
 
 // -----
@@ -330,7 +330,7 @@
 
 // CHECK-LABEL: @applyAsyncSliceOp
 // CHECK-SAME: (%[[OPERAND:.+]]: !stream.resource<transient>, %[[SIZE:.+]]: index)
-func.func @applyAsyncSliceOp(%operand: !stream.resource<transient>, %size: index) {
+util.func public @applyAsyncSliceOp(%operand: !stream.resource<transient>, %size: index) {
   %c16 = arith.constant 16 : index
   %c128 = arith.constant 128 : index
   %c144 = arith.constant 144 : index
@@ -346,14 +346,14 @@
   } => !stream.timepoint
   // CHECK: util.optimization_barrier %[[ALLOCA]]
   util.optimization_barrier %result : !stream.resource<transient>
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @applyAsyncFillOp
 // CHECK-SAME: (%[[OPERAND:.+]]: !stream.resource<transient>, %[[SIZE:.+]]: index)
-func.func @applyAsyncFillOp(%operand: !stream.resource<transient>, %size: index) {
+util.func public @applyAsyncFillOp(%operand: !stream.resource<transient>, %size: index) {
   %c16 = arith.constant 16 : index
   %c128 = arith.constant 128 : index
   %c144 = arith.constant 144 : index
@@ -366,7 +366,7 @@
   } => !stream.timepoint
   // CHECK: util.optimization_barrier %[[OPERAND]]
   util.optimization_barrier %result : !stream.resource<transient>
-  return
+  util.return
 }
 
 // -----
@@ -378,7 +378,7 @@
 // CHECK-SAME: (%[[UPDATE:.+]]: !stream.resource<external>,
 // CHECK-SAME:  %[[OPERAND:.+]]: !stream.resource<transient>,
 // CHECK-SAME:  %[[SIZE:.+]]: index)
-func.func @applyAsyncUpdateOp(%update: !stream.resource<external>, %operand: !stream.resource<transient>, %size: index) {
+util.func public @applyAsyncUpdateOp(%update: !stream.resource<external>, %operand: !stream.resource<transient>, %size: index) {
   %c16 = arith.constant 16 : index
   %c128 = arith.constant 128 : index
   %c144 = arith.constant 144 : index
@@ -393,7 +393,7 @@
   } => !stream.timepoint
   // CHECK: util.optimization_barrier %[[OPERAND]]
   util.optimization_barrier %result : !stream.resource<transient>
-  return
+  util.return
 }
 
 // -----
@@ -402,7 +402,7 @@
 // CHECK-SAME: (%[[SOURCE:.+]]: !stream.resource<external>,
 // CHECK-SAME:  %[[TARGET:.+]]: !stream.resource<transient>,
 // CHECK-SAME:  %[[SIZE:.+]]: index)
-func.func @applyAsyncCopyOp(%source: !stream.resource<external>, %target: !stream.resource<transient>, %size: index) {
+util.func public @applyAsyncCopyOp(%source: !stream.resource<external>, %target: !stream.resource<transient>, %size: index) {
   %c16 = arith.constant 16 : index
   %c128 = arith.constant 128 : index
   %c144 = arith.constant 144 : index
@@ -417,7 +417,7 @@
   } => !stream.timepoint
   // CHECK: util.optimization_barrier %[[TARGET]]
   util.optimization_barrier %result : !stream.resource<transient>
-  return
+  util.return
 }
 
 // -----
@@ -430,7 +430,7 @@
 // CHECK-SAME: (%[[SOURCE:.+]]: !stream.resource<external>,
 // CHECK-SAME:  %[[TARGET:.+]]: !stream.resource<transient>,
 // CHECK-SAME:  %[[SIZE:.+]]: index)
-func.func @applyConcurrentAsyncCopyOp(%source: !stream.resource<external>, %target: !stream.resource<transient>, %size: index) {
+util.func public @applyConcurrentAsyncCopyOp(%source: !stream.resource<external>, %target: !stream.resource<transient>, %size: index) {
   %c0 = arith.constant 0 : index
   %c16 = arith.constant 16 : index
   %c128 = arith.constant 128 : index
@@ -453,7 +453,7 @@
   } => !stream.timepoint
   // CHECK: util.optimization_barrier %[[TARGET]]
   util.optimization_barrier %result : !stream.resource<transient>
-  return
+  util.return
 }
 
 // -----
@@ -465,7 +465,7 @@
 // CHECK-SAME:  %[[SEND:.+]]: !stream.resource<external>, %[[SEND_SIZE:[a-z0-9]+]]: index,
 // CHECK-SAME:  %[[RECV:.+]]: !stream.resource<transient>, %[[RECV_SIZE:[a-z0-9]+]]: index,
 // CHECK-SAME:  %[[COUNT:[a-z0-9]+]]: index)
-func.func @applyAsyncCollectiveOpOutOfPlace(%channel: !stream.channel, %send: !stream.resource<external>, %send_size: index, %recv: !stream.resource<transient>, %recv_size: index, %count: index) {
+util.func public @applyAsyncCollectiveOpOutOfPlace(%channel: !stream.channel, %send: !stream.resource<external>, %send_size: index, %recv: !stream.resource<transient>, %recv_size: index, %count: index) {
   %c0 = arith.constant 0 : index
   // CHECK: stream.cmd.execute
   // CHECK-SAME: with(%[[SEND]] as %[[SEND_CAPTURE:.+]]: !stream.resource<external>{%[[SEND_SIZE]]},
@@ -482,7 +482,7 @@
   } => !stream.timepoint
   // CHECK: util.optimization_barrier %[[RECV]]
   util.optimization_barrier %result : !stream.resource<transient>
-  return
+  util.return
 }
 
 // -----
@@ -491,7 +491,7 @@
 
 // CHECK-LABEL: @applyAsyncTransferOp
 // CHECK-SAME: (%[[OPERAND:.+]]: !stream.resource<transient>, %[[SIZE:.+]]: index)
-func.func @applyAsyncTransferOp(%operand: !stream.resource<transient>, %size: index) {
+util.func public @applyAsyncTransferOp(%operand: !stream.resource<transient>, %size: index) {
   // CHECK: %[[ALLOCA:.+]], %[[ALLOCA_TIMEPOINT:.+]] = stream.resource.alloca uninitialized : !stream.resource<transient>{%[[SIZE]]}
   // CHECK: stream.cmd.execute await(%[[ALLOCA_TIMEPOINT]])
   // CHECK-SAME: with(%[[OPERAND]] as %[[OPERAND_CAPTURE:.+]]: !stream.resource<transient>{%[[SIZE]]},
@@ -504,14 +504,14 @@
   } => !stream.timepoint
   // CHECK: util.optimization_barrier %[[ALLOCA]]
   util.optimization_barrier %result : !stream.resource<transient>
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @applyAsyncDispatchOp
 // CHECK-SAME: (%[[OPERAND:.+]]: !stream.resource<transient>, %[[SIZE:.+]]: index, %[[OFFSET:.+]]: index, %[[END:.+]]: index, %[[LENGTH:.+]]: index)
-func.func @applyAsyncDispatchOp(%operand: !stream.resource<transient>, %size: index, %offset: index, %end: index, %length: index) {
+util.func public @applyAsyncDispatchOp(%operand: !stream.resource<transient>, %size: index, %offset: index, %end: index, %length: index) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c4 = arith.constant 4 : index
@@ -533,7 +533,7 @@
   util.optimization_barrier %results#0 : !stream.resource<transient>
   // CHECK: util.optimization_barrier %[[ALLOCA]]
   util.optimization_barrier %results#1 : !stream.resource<transient>
-  return
+  util.return
 }
 
 // -----
@@ -543,7 +543,7 @@
 
 // CHECK-LABEL: @applyAsyncDispatchUnusedOp
 // CHECK-SAME: (%[[OPERAND:.+]]: !stream.resource<transient>, %[[SIZE:.+]]: index, %[[OFFSET:.+]]: index, %[[END:.+]]: index, %[[LENGTH:.+]]: index)
-func.func @applyAsyncDispatchUnusedOp(%operand: !stream.resource<transient>, %size: index, %offset: index, %end: index, %length: index) {
+util.func public @applyAsyncDispatchUnusedOp(%operand: !stream.resource<transient>, %size: index, %offset: index, %end: index, %length: index) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c4 = arith.constant 4 : index
@@ -572,7 +572,7 @@
   util.optimization_barrier %result_timepoint : !stream.timepoint
   // CHECK: util.optimization_barrier %[[OPERAND]]
   util.optimization_barrier %result : !stream.resource<transient>
-  return
+  util.return
 }
 
 // -----
@@ -582,7 +582,7 @@
 
 // CHECK-LABEL: @applyAsyncCallOp
 // CHECK-SAME: (%[[OPERAND:.+]]: !stream.resource<transient>, %[[SIZE:.+]]: index, %[[OFFSET:.+]]: index, %[[END:.+]]: index, %[[LENGTH:.+]]: index)
-func.func @applyAsyncCallOp(%operand: !stream.resource<transient>, %size: index, %offset: index, %end: index, %length: index) {
+util.func public @applyAsyncCallOp(%operand: !stream.resource<transient>, %size: index, %offset: index, %end: index, %length: index) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c4 = arith.constant 4 : index
@@ -602,7 +602,7 @@
   util.optimization_barrier %results#0 : !stream.resource<transient>
   // CHECK: util.optimization_barrier %[[ALLOCA]]
   util.optimization_barrier %results#1 : !stream.resource<transient>
-  return
+  util.return
 }
 
 // -----
@@ -612,15 +612,15 @@
 // CHECK-LABEL: @asyncLoadStore
 // CHECK-SAME: (%[[OPERAND:.+]]: !stream.resource<staging>,
 // CHECK-SAME:  %[[SIZE:.+]]: index)
-func.func @asyncLoadStore(%operand: !stream.resource<staging>, %size: index) -> f32 {
+util.func public @asyncLoadStore(%operand: !stream.resource<staging>, %size: index) -> f32 {
   %c0 = arith.constant 0 : index
   %cst = arith.constant 5.4 : f32
   // CHECK: stream.resource.store %cst, %[[OPERAND]][%c0] : f32 -> !stream.resource<staging>{%[[SIZE]]}
   %0 = stream.async.store %cst, %operand[%c0] : f32 -> %operand as !stream.resource<staging>{%size}
   // CHECK: %[[RESULT:.+]] = stream.resource.load %[[OPERAND]][%c0] : !stream.resource<staging>{%[[SIZE]]} -> f32
   %1 = stream.async.load %0[%c0] : !stream.resource<staging>{%size} -> f32
-  // CHECK: return %[[RESULT]]
-  return %1 : f32
+  // CHECK: util.return %[[RESULT]]
+  util.return %1 : f32
 }
 
 // -----
@@ -630,7 +630,7 @@
 // CHECK-LABEL: @scfFor
 // CHECK-SAME: (%[[OPERAND:.+]]: !stream.resource<staging>,
 // CHECK-SAME:  %[[SIZE:.+]]: index)
-func.func @scfFor(%operand: !stream.resource<staging>, %size: index) -> f32 {
+util.func public @scfFor(%operand: !stream.resource<staging>, %size: index) -> f32 {
   %c0 = arith.constant 0 : index
   %c2 = arith.constant 2 : index
   %c4 = arith.constant 4 : index
@@ -650,6 +650,6 @@
     scf.yield %2 : f32
   }
 
-  // CHECK: return %[[FOR]]
-  return %sum : f32
+  // CHECK: util.return %[[FOR]]
+  util.return %sum : f32
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_concurrency.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_concurrency.mlir
index db42945..14396ce 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_concurrency.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_concurrency.mlir

@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-stream-schedule-concurrency))" %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module( util.func(iree-stream-schedule-concurrency))" %s | FileCheck %s
 
 // Tests that when favor=min-peak-memory we assume ops are in an order that
 // reduces live memory ranges and only optimistically put them in concurrency
@@ -6,7 +6,7 @@
 
 // CHECK-LABEL: @partitioningForMinPeakMemory
 // CHECK-SAME: (%[[ARG0:.+]]: !stream.resource<external>, %[[ARG1:.+]]: !stream.resource<external>)
-func.func @partitioningForMinPeakMemory(%arg0: !stream.resource<external>, %arg1: !stream.resource<external>) -> !stream.resource<external>
+util.func public @partitioningForMinPeakMemory(%arg0: !stream.resource<external>, %arg1: !stream.resource<external>) -> !stream.resource<external>
     attributes {stream.partitioning = #stream.partitioning_config<"min-peak-memory">} {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
@@ -45,7 +45,7 @@
     stream.yield %5 : !stream.resource<external>{%c20}
   } => !stream.timepoint
   %0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c20}
-  return %0 : !stream.resource<external>
+  util.return %0 : !stream.resource<external>
 }
 
 // -----
@@ -55,7 +55,7 @@
 
 // CHECK-LABEL: @partitioningForMaxConcurrency
 // CHECK-SAME: (%[[ARG0:.+]]: !stream.resource<external>, %[[ARG1:.+]]: !stream.resource<external>)
-func.func @partitioningForMaxConcurrency(%arg0: !stream.resource<external>, %arg1: !stream.resource<external>) -> !stream.resource<external>
+util.func public @partitioningForMaxConcurrency(%arg0: !stream.resource<external>, %arg1: !stream.resource<external>) -> !stream.resource<external>
     attributes {stream.partitioning = #stream.partitioning_config<"max-concurrency">} {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
@@ -98,7 +98,7 @@
     stream.yield %5 : !stream.resource<external>{%c20}
   } => !stream.timepoint
   %0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<external>{%c20}
-  return %0 : !stream.resource<external>
+  util.return %0 : !stream.resource<external>
 }
 
 // -----
@@ -109,7 +109,7 @@
 
 // CHECK-LABEL: @keepTiedOpsSeparate
 // CHECK-SAME: (%[[ARG0:.+]]: !stream.resource<external>)
-func.func @keepTiedOpsSeparate(%arg0: !stream.resource<external>) -> (!stream.resource<external>, !stream.resource<external>) {
+util.func public @keepTiedOpsSeparate(%arg0: !stream.resource<external>) -> (!stream.resource<external>, !stream.resource<external>) {
   %c0 = arith.constant 0 : index
   %c4 = arith.constant 4 : index
   // CHECK: stream.async.execute
@@ -123,7 +123,7 @@
     // CHECK-NEXT: stream.yield
     stream.yield %1, %2 : !stream.resource<external>{%c4}, !stream.resource<external>{%c4}
   } => !stream.timepoint
-  return %results#0, %results#1 : !stream.resource<external>, !stream.resource<external>
+  util.return %results#0, %results#1 : !stream.resource<external>, !stream.resource<external>
 }
 
 // -----
@@ -138,7 +138,7 @@
 // CHECK-SAME:  %[[SEND0:.+]]: !stream.resource<external>, %[[SEND0_SIZE:[a-z0-9]+]]: index,
 // CHECK-SAME:  %[[SEND1:.+]]: !stream.resource<transient>, %[[SEND1_SIZE:[a-z0-9]+]]: index,
 // CHECK-SAME:  %[[RECV_SIZE:[a-z0-9]+]]: index, %[[COUNT:[a-z0-9]+]]: index)
-func.func @groupCollectiveOps(%channel: !stream.channel, %send0: !stream.resource<external>, %send0_size: index, %send1: !stream.resource<transient>, %send1_size: index, %recv_size: index, %count: index) {
+util.func public @groupCollectiveOps(%channel: !stream.channel, %send0: !stream.resource<external>, %send0_size: index, %send1: !stream.resource<transient>, %send1_size: index, %recv_size: index, %count: index) {
   %c0 = arith.constant 0 : index
   // CHECK: stream.async.execute
   %result:2, %result_timepoint = stream.async.execute
@@ -184,5 +184,5 @@
   } => !stream.timepoint
   util.optimization_barrier %result#0 : !stream.resource<transient>
   util.optimization_barrier %result#1 : !stream.resource<transient>
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_execution.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_execution.mlir
index fab2dfb..357a936 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_execution.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_execution.mlir

@@ -1,10 +1,10 @@
-// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-stream-schedule-execution))" %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module( util.func(iree-stream-schedule-execution))" %s | FileCheck %s
 
 // Tests basic partitioning of multiple ops.
 
 // CHECK-LABEL: @partitioning
 // CHECK-SAME: (%[[ARG0:.+]]: !stream.resource<external>, %[[ARG1:.+]]: !stream.resource<external>)
-func.func @partitioning(%arg0: !stream.resource<external>, %arg1: !stream.resource<external>) -> !stream.resource<external> {
+util.func public @partitioning(%arg0: !stream.resource<external>, %arg1: !stream.resource<external>) -> !stream.resource<external> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c20 = arith.constant 20 : index
@@ -28,8 +28,8 @@
   // CHECK-NEXT: stream.yield %[[DISPATCH2]] : !stream.resource<external>{%c20}
   // CHECK-NEXT: } => !stream.timepoint
   // CHECK-NEXT: %[[READY:.+]] = stream.timepoint.await %[[TIMEPOINT]] => %[[RESULT]] : !stream.resource<external>{%c20}
-  // CHECK-NEXT: return %[[READY]]
-  return %6 : !stream.resource<external>
+  // CHECK-NEXT: util.return %[[READY]]
+  util.return %6 : !stream.resource<external>
 }
 
 // -----
@@ -40,7 +40,7 @@
 
 // CHECK-LABEL: @partitioningWithAffinities
 // CHECK-SAME: (%[[ARG0:.+]]: !stream.resource<external>)
-func.func @partitioningWithAffinities(%arg0: !stream.resource<external>) -> !stream.resource<external> {
+util.func public @partitioningWithAffinities(%arg0: !stream.resource<external>) -> !stream.resource<external> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c20 = arith.constant 20 : index
@@ -74,8 +74,8 @@
   // CHECK-NEXT: %[[READY:.+]] = stream.timepoint.await
   // CHECK-SAME:   on(#hal.affinity.queue<[1]>)
   // CHECK-SAME:   %[[TIMEPOINT1]] => %[[RESULT]] : !stream.resource<external>{%c20}
-  // CHECK-NEXT: return %[[READY]]
-  return %dispatch2 : !stream.resource<external>
+  // CHECK-NEXT: util.return %[[READY]]
+  util.return %dispatch2 : !stream.resource<external>
 }
 
 // -----
@@ -86,7 +86,7 @@
 
 // CHECK-LABEL: @partitioningWithConcurrentAffinities
 // CHECK-SAME: (%[[ARG0:.+]]: !stream.resource<external>)
-func.func @partitioningWithConcurrentAffinities(%arg0: !stream.resource<external>) -> !stream.resource<external> {
+util.func public @partitioningWithConcurrentAffinities(%arg0: !stream.resource<external>) -> !stream.resource<external> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c20 = arith.constant 20 : index
@@ -128,8 +128,8 @@
   // CHECK-NEXT: %[[READY:.+]] = stream.timepoint.await
   // CHECK-SAME:   on(#hal.affinity.queue<[2]>)
   // CHECK-SAME:   %[[TIMEPOINT2]] => %[[RESULT]] : !stream.resource<external>{%c20}
-  // CHECK-NEXT: return %[[READY]]
-  return %dispatch2 : !stream.resource<external>
+  // CHECK-NEXT: util.return %[[READY]]
+  util.return %dispatch2 : !stream.resource<external>
 }
 
 // -----
@@ -139,7 +139,7 @@
 // happen in-place on the splat and we expect the execution regions to be tied.
 
 // CHECK-LABEL: @partitionWithinBlocks
-func.func @partitionWithinBlocks(%cond: i1) -> !stream.resource<transient> {
+util.func public @partitionWithinBlocks(%cond: i1) -> !stream.resource<transient> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c1280 = arith.constant 1280 : index
@@ -156,8 +156,8 @@
   // CHECK: stream.async.dispatch @ex::@dispatch_0[%c1, %c1, %c1](%[[BB1_SPLAT]][{{.+}}]) : (!stream.resource<transient>{%c1280}) -> %[[BB1_SPLAT]]{%c1280}
   %3 = stream.async.dispatch @ex::@dispatch_0[%c1, %c1, %c1](%splat[%c0 to %c1280 for %c1280]) : (!stream.resource<transient>{%c1280}) -> %splat{%c1280}
   // CHECK: %[[BB1_READY:.+]] = stream.timepoint.await %[[BB1_TIMEPOINT]] => %[[BB1_RESULT]]
-  // CHECK: return %[[BB1_READY]]
-  return %3 : !stream.resource<transient>
+  // CHECK: util.return %[[BB1_READY]]
+  util.return %3 : !stream.resource<transient>
 ^bb2:
   // CHECK: %[[BB2_RESULT:.+]], %[[BB2_TIMEPOINT:.+]] = stream.async.execute await(%[[SPLAT_TIMEPOINT]]) =>
   // CHECK-SAME: with(%[[SPLAT]] as %[[BB2_SPLAT:.+]]: !stream.resource<transient>{%c1280})
@@ -165,8 +165,8 @@
   // CHECK: stream.async.dispatch @ex::@dispatch_1[%c1, %c1, %c1](%[[BB2_SPLAT]][{{.+}}]) : (!stream.resource<transient>{%c1280}) -> %[[BB2_SPLAT]]{%c1280}
   %4 = stream.async.dispatch @ex::@dispatch_1[%c1, %c1, %c1](%splat[%c0 to %c1280 for %c1280]) : (!stream.resource<transient>{%c1280}) -> %splat{%c1280}
   // CHECK: %[[BB2_READY:.+]] = stream.timepoint.await %[[BB2_TIMEPOINT]] => %[[BB2_RESULT]]
-  // CHECK: return %[[BB2_READY]]
-  return %4 : !stream.resource<transient>
+  // CHECK: util.return %[[BB2_READY]]
+  util.return %4 : !stream.resource<transient>
 }
 
 // -----
@@ -176,7 +176,7 @@
 // single block and break the assumption that one block == one partition.
 
 // CHECK-LABEL: @deviceHostDevice
-func.func @deviceHostDevice() -> !stream.resource<transient> {
+util.func public @deviceHostDevice() -> !stream.resource<transient> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c123_i8 = arith.constant 123 : i8
@@ -201,8 +201,8 @@
   %5 = stream.async.transfer %4 : !stream.resource<staging>{%c1} -> !stream.resource<transient>{%c1}
   // CHECK-NEXT: stream.yield %[[TRANSFER_H2D]]
   // CHECK: %[[READY_H2D:.+]] = stream.timepoint.await %[[TIMEPOINT_H2D]] => %[[RESULT_H2D]] : !stream.resource<transient>{%c1}
-  // CHECK: return %[[READY_H2D]]
-  return %5 : !stream.resource<transient>
+  // CHECK: util.return %[[READY_H2D]]
+  util.return %5 : !stream.resource<transient>
 }
 
 // -----
@@ -210,7 +210,7 @@
 // Tests that partitioning does not hoist ops across cf.asserts.
 
 // CHECK-LABEL: @dontHoistPastAsserts
-func.func @dontHoistPastAsserts(%arg0: !stream.resource<external>, %arg1: !stream.resource<external>) -> !stream.resource<external> {
+util.func public @dontHoistPastAsserts(%arg0: !stream.resource<external>, %arg1: !stream.resource<external>) -> !stream.resource<external> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c20 = arith.constant 20 : index
@@ -242,7 +242,7 @@
   // CHECK-NEXT: stream.async.dispatch @ex::@dispatch_2
   %6 = stream.async.dispatch @ex::@dispatch_2[%c1, %c1, %c1](%3[%c0 to %c1280 for %c1280], %5[%c0 to %c20 for %c20]) : (!stream.resource<transient>{%c1280}, !stream.resource<transient>{%c20}) -> !stream.resource<external>{%c20}
 
-  return %6 : !stream.resource<external>
+  util.return %6 : !stream.resource<external>
 }
 
 // -----
@@ -252,7 +252,7 @@
 // the cloned values will be exported to provide the value.
 
 // CHECK-LABEL: @cloneAcrossPartitions
-func.func @cloneAcrossPartitions(%cond: i1) -> (!stream.resource<external>, !stream.resource<transient>) {
+util.func public @cloneAcrossPartitions(%cond: i1) -> (!stream.resource<external>, !stream.resource<transient>) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c123_i8 = arith.constant 123 : i8
@@ -281,8 +281,8 @@
   %result = stream.async.transfer %dispatch1 : !stream.resource<transient>{%c1} -> !stream.resource<external>{%c1}
   // CHECK: %[[PARTITION1:.+]] = stream.timepoint.await
 
-  // CHECK: return %[[PARTITION1]], %[[PARTITION0]]#1
-  return %result, %splat : !stream.resource<external>, !stream.resource<transient>
+  // CHECK: util.return %[[PARTITION1]], %[[PARTITION0]]#1
+  util.return %result, %splat : !stream.resource<external>, !stream.resource<transient>
 }
 
 // -----
@@ -293,7 +293,7 @@
 // tracking both the host and device hazards correctly.
 
 // CHECK-LABEL: @deviceHostDeviceCrossing
-func.func @deviceHostDeviceCrossing(%arg0: i1) -> !stream.resource<transient> {
+util.func public @deviceHostDeviceCrossing(%arg0: i1) -> !stream.resource<transient> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c128 = arith.constant 128 : index
@@ -314,8 +314,8 @@
   // CHECK-NEXT: stream.async.dispatch @ex::@dispatch2
   %4 = stream.async.dispatch @ex::@dispatch2[%c1, %c1, %c1](%1[%c0 to %c128 for %c128], %3[%c0 to %c128 for %c128]) : (!stream.resource<transient>{%c128}, !stream.resource<transient>{%c128}) -> !stream.resource<transient>{%c128}
 
-  // CHECK: return
-  return %4 : !stream.resource<transient>
+  // CHECK: util.return
+  util.return %4 : !stream.resource<transient>
 }
 
 // -----
@@ -325,13 +325,13 @@
 stream.async.func private @inplaceExtern(%arg0: !stream.resource<*>, %arg1: index) -> %arg0
 
 // CHECK-LABEL: @inplaceCall
-func.func @inplaceCall(%arg0: !stream.resource<*>, %arg1: index, %arg2: index) -> (!stream.resource<*>, index) {
+util.func public @inplaceCall(%arg0: !stream.resource<*>, %arg1: index, %arg2: index) -> (!stream.resource<*>, index) {
   %c0 = arith.constant 0 : index
   // CHECK: stream.async.execute
   // CHECK-NEXT: stream.async.call
   %0 = stream.async.call @inplaceExtern(%arg0[%c0 to %arg1 for %arg1], %arg2) : (!stream.resource<*>{%arg1}, index) -> %arg0{%arg1}
   // CHECK: stream.timepoint.await
-  return %0, %arg1 : !stream.resource<*>, index
+  util.return %0, %arg1 : !stream.resource<*>, index
 }
 
 // -----
@@ -341,7 +341,7 @@
 stream.async.func private @inplaceExtern(%arg0: !stream.resource<*>, %arg1: index) -> %arg0
 
 // CHECK-LABEL: @scfRecurse
-func.func @scfRecurse(%arg0: !stream.resource<*>, %arg1: index, %arg2: index) -> (!stream.resource<*>, index) {
+util.func public @scfRecurse(%arg0: !stream.resource<*>, %arg1: index, %arg2: index) -> (!stream.resource<*>, index) {
   %c0 = arith.constant 0 : index
   %c2 = arith.constant 2 : index
   %c4 = arith.constant 4 : index
@@ -354,5 +354,5 @@
     // CHECK: stream.timepoint.await
     scf.yield %0 : !stream.resource<*>
   }
-  return %sum, %arg1 : !stream.resource<*>, index
+  util.return %sum, %arg1 : !stream.resource<*>, index
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/specialize_dispatches.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/specialize_dispatches.mlir
index f8edc0f..4f5f931 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/specialize_dispatches.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/specialize_dispatches.mlir

@@ -8,8 +8,8 @@
 stream.executable private @specializeEx {
   stream.executable.export public @dispatch
   builtin.module  {
-    // CHECK: func.func @dispatch(%[[BINDING:.+]]: !stream.binding, %[[A:.+]]: i32, %[[SITE:.+]]: index)
-    func.func @dispatch(%binding: !stream.binding, %a: i32, %b: index, %c: i1, %d: i1) {
+    // CHECK:  util.func public @dispatch(%[[BINDING:.+]]: !stream.binding, %[[A:.+]]: i32, %[[SITE:.+]]: index)
+     util.func public @dispatch(%binding: !stream.binding, %a: i32, %b: index, %c: i1, %d: i1) {
       // CHECK-NEXT: %[[LUT_I32:.+]] = arith.constant dense<[
       // CHECK-SAME:   [20],
       // CHECK-SAME:   [40]
@@ -35,12 +35,12 @@
       util.optimization_barrier %c : i1
       // CHECK-NEXT: util.optimization_barrier %[[D]] : i1
       util.optimization_barrier %d : i1
-      return
+      util.return
     }
   }
 }
-// CHECK: func.func @specialize(%[[A:.+]]: i32)
-func.func @specialize(%a: i32) {
+// CHECK:  util.func public @specialize(%[[A:.+]]: i32)
+util.func public @specialize(%a: i32) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c20 = arith.constant 20 : index
@@ -58,5 +58,5 @@
       rw %capture[%c0 for %c20] : !stream.resource<transient>{%c20}
     }
   } => !stream.timepoint
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/verify_async_access_ranges.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/verify_async_access_ranges.mlir
index 1a4d361..49ae711 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/verify_async_access_ranges.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/verify_async_access_ranges.mlir

@@ -3,21 +3,21 @@
 // Tests that statically-known valid ranges pass verification.
 
 // CHECK: @inRangeCopy
-func.func @inRangeCopy(%source: !stream.resource<*>, %target: !stream.resource<*>) -> !stream.resource<*> {
+util.func public @inRangeCopy(%source: !stream.resource<*>, %target: !stream.resource<*>) -> !stream.resource<*> {
   %source_size = arith.constant 256 : index
   %target_size = arith.constant 256 : index
   %c128 = arith.constant 128 : index
   %c256 = arith.constant 256 : index
   // CHECK: = stream.async.copy
   %0 = stream.async.copy %source[%c128 to %c256], %target[%c128 to %c256], %c128 : !stream.resource<*>{%source_size} -> %target as !stream.resource<*>{%target_size}
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
 
 // Tests that statically-known invalid ranges emit errors.
 // For more useful reporting we report all errors on an op so this expects 2.
-func.func @outOfRangeCopy(%source: !stream.resource<*>, %target: !stream.resource<*>) -> !stream.resource<*> {
+util.func public @outOfRangeCopy(%source: !stream.resource<*>, %target: !stream.resource<*>) -> !stream.resource<*> {
   %source_size = arith.constant 256 : index
   %target_size = arith.constant 255 : index  // NOTE: too small!
   %c128 = arith.constant 128 : index
@@ -27,7 +27,7 @@
   // expected-error @+2 {{invalid Write access range [256 to 512 for 128] of resource %arg1 with size 255}}
   // expected-error @+1 {{invalid Write access range [256 to 512 for 128] of resource %0 with size 255}}
   %0 = stream.async.copy %source[%c128 to %c512], %target[%c256 to %c512], %c128 : !stream.resource<*>{%source_size} -> %target as !stream.resource<*>{%target_size}
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }
 
 // -----
@@ -37,10 +37,10 @@
 // and this pass could verify the conditions (size of A < size of B, etc).
 
 // CHECK-LABEL: @dynamicSizes
-func.func @dynamicSizes(%source: !stream.resource<*>, %source_size: index, %target: !stream.resource<*>, %target_size: index) -> !stream.resource<*> {
+util.func public @dynamicSizes(%source: !stream.resource<*>, %source_size: index, %target: !stream.resource<*>, %target_size: index) -> !stream.resource<*> {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   // CHECK: = stream.async.copy
   %0 = stream.async.copy %source[%c0 to %c128], %target[%c0 to %c128], %c128 : !stream.resource<*>{%source_size} -> %target as !stream.resource<*>{%target_size}
-  return %0 : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Util/Conversion/ConversionPatterns.cpp b/compiler/src/iree/compiler/Dialect/Util/Conversion/ConversionPatterns.cpp
index 041e6da..a07a41f 100644
--- a/compiler/src/iree/compiler/Dialect/Util/Conversion/ConversionPatterns.cpp
+++ b/compiler/src/iree/compiler/Dialect/Util/Conversion/ConversionPatterns.cpp

@@ -94,7 +94,79 @@
   }
 };
 
-struct ConvertFuncOp : public OpConversionPattern<mlir::func::FuncOp> {
+struct ConvertFuncOp : public OpConversionPattern<IREE::Util::FuncOp> {
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(IREE::Util::FuncOp funcOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto &typeConverter = *getTypeConverter();
+
+    // Convert the input signature types.
+    // TODO(benvanik): dynamic shapes by passing in tensor dynamic dims.
+    auto originalType = funcOp.getFunctionType();
+    TypeConverter::SignatureConversion newSignature(
+        originalType.getNumInputs());
+    for (auto argType : llvm::enumerate(originalType.getInputs())) {
+      if (failed(typeConverter.convertSignatureArg(
+              argType.index(), argType.value(), newSignature))) {
+        return rewriter.notifyMatchFailure(funcOp,
+                                           "failed to convert arg type");
+      }
+    }
+    SmallVector<Type> newResultTypes;
+    if (failed(typeConverter.convertTypes(originalType.getResults(),
+                                          newResultTypes))) {
+      return rewriter.notifyMatchFailure(funcOp,
+                                         "failed to convert result type");
+    }
+
+    // Replace function.
+    auto newFuncOp = rewriter.cloneWithoutRegions(funcOp);
+    newFuncOp.getBlocks().clear();
+    rewriter.inlineRegionBefore(funcOp.getFunctionBody(),
+                                newFuncOp.getFunctionBody(), newFuncOp.end());
+    newFuncOp.setType(rewriter.getFunctionType(newSignature.getConvertedTypes(),
+                                               newResultTypes));
+    if (failed(rewriter.convertRegionTypes(&newFuncOp.getFunctionBody(),
+                                           typeConverter, &newSignature))) {
+      return rewriter.notifyMatchFailure(funcOp,
+                                         "failed to convert region types");
+    }
+    rewriter.eraseOp(funcOp);
+    return success();
+  }
+};
+
+struct ConvertCallOp : public OpConversionPattern<IREE::Util::CallOp> {
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(IREE::Util::CallOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    SmallVector<Type> resultTypes;
+    if (failed(getTypeConverter()->convertTypes(op.getResultTypes(),
+                                                resultTypes))) {
+      return rewriter.notifyMatchFailure(op, "unable to convert result types");
+    }
+    auto newOp = rewriter.replaceOpWithNewOp<IREE::Util::CallOp>(
+        op, resultTypes, op.getCallee(), adaptor.getOperands(),
+        adaptor.getTiedOperandsAttr());
+    newOp->setDialectAttrs(op->getDialectAttrs());
+    return success();
+  }
+};
+
+struct ConvertReturnOp : public OpConversionPattern<IREE::Util::ReturnOp> {
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(IREE::Util::ReturnOp returnOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    rewriter.replaceOpWithNewOp<IREE::Util::ReturnOp>(returnOp,
+                                                      adaptor.getOperands());
+    return success();
+  }
+};
+
+struct ConvertFuncFuncOp : public OpConversionPattern<mlir::func::FuncOp> {
   using OpConversionPattern::OpConversionPattern;
   LogicalResult
   matchAndRewrite(mlir::func::FuncOp funcOp, OpAdaptor adaptor,
@@ -137,7 +209,7 @@
   }
 };
 
-struct ConvertCallOp : public OpConversionPattern<mlir::func::CallOp> {
+struct ConvertFuncCallOp : public OpConversionPattern<mlir::func::CallOp> {
   using OpConversionPattern::OpConversionPattern;
   LogicalResult
   matchAndRewrite(mlir::func::CallOp op, OpAdaptor adaptor,
@@ -153,7 +225,7 @@
   }
 };
 
-struct ConvertReturnOp : public OpConversionPattern<mlir::func::ReturnOp> {
+struct ConvertFuncReturnOp : public OpConversionPattern<mlir::func::ReturnOp> {
   using OpConversionPattern::OpConversionPattern;
   LogicalResult
   matchAndRewrite(mlir::func::ReturnOp returnOp, OpAdaptor adaptor,
@@ -259,28 +331,42 @@
   // We need to rewrite certain types on operands/results so use the default
   // dynamic legality checker to force any ops using such types to run through
   // our patterns.
+
   conversionTarget.addDynamicallyLegalOp<IREE::Util::InitializerOp>(
       [&](IREE::Util::InitializerOp op) {
         return typeConverter.isLegal(&op.getBody());
       });
-  conversionTarget.addDynamicallyLegalOp<mlir::func::FuncOp>(
-      [&](mlir::func::FuncOp op) {
+  conversionTarget.addDynamicallyLegalOp<IREE::Util::FuncOp>(
+      [&](IREE::Util::FuncOp op) {
         return typeConverter.isSignatureLegal(op.getFunctionType()) &&
                typeConverter.isLegal(&op.getBody());
       });
+  addGenericLegalOp<IREE::Util::CallOp>(conversionTarget, typeConverter);
+  addGenericLegalOp<IREE::Util::ReturnOp>(conversionTarget, typeConverter);
+  patterns.insert<ConvertInitializerOp, ConvertFuncOp, ConvertCallOp,
+                  ConvertReturnOp>(typeConverter, context);
+
+  conversionTarget.addDynamicallyLegalOp<func::FuncOp>([&](func::FuncOp op) {
+    return typeConverter.isSignatureLegal(op.getFunctionType()) &&
+           typeConverter.isLegal(&op.getBody());
+  });
   addGenericLegalOp<func::CallOp>(conversionTarget, typeConverter);
   addGenericLegalOp<func::ReturnOp>(conversionTarget, typeConverter);
+  patterns.insert<ConvertFuncFuncOp, ConvertFuncCallOp, ConvertFuncReturnOp>(
+      typeConverter, context);
+
   addGenericLegalOp<cf::BranchOp>(conversionTarget, typeConverter);
   addGenericLegalOp<cf::CondBranchOp>(conversionTarget, typeConverter);
   addGenericLegalOp<cf::SwitchOp>(conversionTarget, typeConverter);
+  patterns.insert<ConvertBranchOp, ConvertCondBranchOp, ConvertSwitchOp>(
+      typeConverter, context);
+
   addGenericLegalOp<arith::SelectOp>(conversionTarget, typeConverter);
+  patterns.insert<ConvertSelectOp>(typeConverter, context);
+
   addGenericLegalOp<scf::IfOp>(conversionTarget, typeConverter);
   addGenericLegalOp<scf::YieldOp>(conversionTarget, typeConverter);
-  patterns
-      .insert<ConvertInitializerOp, ConvertFuncOp, ConvertCallOp,
-              ConvertReturnOp, ConvertBranchOp, ConvertCondBranchOp,
-              ConvertSwitchOp, ConvertSelectOp, ConvertIfOp, ConvertYieldOp>(
-          typeConverter, context);
+  patterns.insert<ConvertIfOp, ConvertYieldOp>(typeConverter, context);
 }
 
 } // namespace mlir::iree_compiler

diff --git a/compiler/src/iree/compiler/Dialect/Util/Conversion/MemRefToUtil/test/memref_ops.mlir b/compiler/src/iree/compiler/Dialect/Util/Conversion/MemRefToUtil/test/memref_ops.mlir
index deb6cbd..745037b 100644
--- a/compiler/src/iree/compiler/Dialect/Util/Conversion/MemRefToUtil/test/memref_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/Conversion/MemRefToUtil/test/memref_ops.mlir

@@ -5,89 +5,89 @@
 // -----
 // Must be rank-0 or rank-1.
 // expected-error @-3 {{conversion to util failed}}
-func.func @verify_invalid_rank_2(%buffer: memref<4x2xf32>, %idx: index) -> f32{
+util.func @verify_invalid_rank_2(%buffer: memref<4x2xf32>, %idx: index) -> f32{
   // expected-error @below {{failed to legalize operation 'memref.load'}}
   %0 = memref.load %buffer[%idx, %idx] : memref<4x2xf32>
-  return %0 : f32
+  util.return %0 : f32
 }
 
 // -----
 // Must have an identity map.
 // expected-error @-3 {{conversion to util failed}}
 #map = affine_map<(d0)[s0] -> (d0 * s0)>
-func.func @verify_invalid_non_identity_map(%buffer: memref<4xf32, #map>, %idx: index) -> f32 {
+util.func @verify_invalid_non_identity_map(%buffer: memref<4xf32, #map>, %idx: index) -> f32 {
   // expected-error @below {{failed to legalize operation 'memref.load'}}
   %0 = memref.load %buffer[%idx] : memref<4xf32, #map>
-  return %0 : f32
+  util.return %0 : f32
 }
 
 // -----
 // CHECK-LABEL: @assume_alignment
-func.func @assume_alignment(%buffer: memref<?xf32>) {
+util.func @assume_alignment(%buffer: memref<?xf32>) {
   // CHECK-NOT: assume_alignment
   memref.assume_alignment %buffer, 64 : memref<?xf32>
-  func.return
+  util.return
 }
 
 // -----
 // CHECK-LABEL: @cast
-func.func @cast(%buffer: memref<?xf32>) -> memref<5xf32> {
+util.func @cast(%buffer: memref<?xf32>) -> memref<5xf32> {
   // CHECK-NOT: memref.cast
   %0 = memref.cast %buffer : memref<?xf32> to memref<5xf32>
-  // CHECK: return %arg0 : !util.buffer
-  func.return %0 : memref<5xf32>
+  // CHECK: util.return %arg0 : !util.buffer
+  util.return %0 : memref<5xf32>
 }
 
 // -----
 // CHECK-LABEL: @alloca() -> !util.buffer
-func.func @alloca() -> memref<16xi32> {
+util.func @alloca() -> memref<16xi32> {
   // CHECK: %[[ALLOCATION_SIZE:.+]] = arith.constant 64 : index
   // CHECK: %[[BUFFER:.+]] = util.buffer.alloc uninitialized : !util.buffer{%[[ALLOCATION_SIZE]]}
   %0 = memref.alloca() : memref<16xi32>
-  // CHECK: return %[[BUFFER]]
-  return %0 : memref<16xi32>
+  // CHECK: util.return %[[BUFFER]]
+  util.return %0 : memref<16xi32>
 }
 
 // -----
 // CHECK-LABEL: @alloca_dynamic_size
 // CHECK-SAME: (%[[LENGTH:.+]]: index)
-func.func @alloca_dynamic_size(%length : index) -> memref<?xi32> {
+util.func @alloca_dynamic_size(%length : index) -> memref<?xi32> {
   // CHECK: %[[ELEM_SIZE:.+]] = arith.constant 4 : index
   // CHECK: %[[ALLOCATION_SIZE:.+]] = arith.muli %[[LENGTH]], %[[ELEM_SIZE]] : index
   // CHECK: %[[BUFFER:.+]] = util.buffer.alloc uninitialized : !util.buffer{%[[ALLOCATION_SIZE]]}
   %0 = memref.alloca(%length) : memref<?xi32>
-  // CHECK: return %[[BUFFER]]
-  return %0 : memref<?xi32>
+  // CHECK: util.return %[[BUFFER]]
+  util.return %0 : memref<?xi32>
 }
 
 // -----
 // CHECK-LABEL: @alloc_i16
 // CHECK-SAME: (%[[IDX0:.+]]: index) -> !util.buffer {
-func.func @alloc_i16(%idx0: index) -> memref<4xi16> {
+util.func @alloc_i16(%idx0: index) -> memref<4xi16> {
   // CHECK: %[[C8:.*]] = arith.constant 8 : index
   // CHECK: %[[BUFFER:.*]] = util.buffer.alloc uninitialized : !util.buffer{%[[C8]]}
   %0 = memref.alloca() : memref<4xi16>
-  // CHECK: return %[[BUFFER]]
-  return %0 : memref<4xi16>
+  // CHECK: util.return %[[BUFFER]]
+  util.return %0 : memref<4xi16>
 }
 
 // -----
 // CHECK-LABEL: @alloc_index
 // CHECK-SAME: (%[[IDX0:.+]]: index) -> !util.buffer {
-func.func @alloc_index(%idx0: index) -> memref<4xindex> {
+util.func @alloc_index(%idx0: index) -> memref<4xindex> {
   // CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
   // CHECK-DAG: %[[SIZEOF:.*]] = util.sizeof index
   // CHECK: %[[SZ:.*]] = arith.muli %[[SIZEOF]], %[[C4]]
   // CHECK: %[[BUFFER:.*]] = util.buffer.alloc uninitialized : !util.buffer{%[[SZ]]}
   %0 = memref.alloca() : memref<4xindex>
-  // CHECK: return %[[BUFFER]]
-  return %0 : memref<4xindex>
+  // CHECK: util.return %[[BUFFER]]
+  util.return %0 : memref<4xindex>
 }
 
 // -----
 // CHECK-LABEL: @load_store_f32
 // CHECK-SAME: (%[[BUFFER:.+]]: !util.buffer, %[[IDX0:.+]]: index, %[[IDX1:.+]]: index) -> f32 {
-func.func @load_store_f32(%buffer: memref<?xf32>, %idx0: index, %idx1: index) -> f32 {
+util.func @load_store_f32(%buffer: memref<?xf32>, %idx0: index, %idx1: index) -> f32 {
   // CHECK: %[[BUFFER_SIZE:.+]] = util.buffer.size %[[BUFFER]]
   // CHECK: %[[IDX0_BYTES:.+]] = arith.muli %[[IDX0]], %c4
   // CHECK: %[[VALUE:.+]] = util.buffer.load %[[BUFFER]][%[[IDX0_BYTES]] for %c4] : !util.buffer{%[[BUFFER_SIZE]]} -> f32
@@ -95,8 +95,8 @@
   // CHECK: %[[IDX1_BYTES:.+]] = arith.muli %[[IDX1]], %c4
   // CHECK: util.buffer.store %[[VALUE]], %[[BUFFER]][%[[IDX1_BYTES]] for %c4] : f32 -> !util.buffer{%[[BUFFER_SIZE]]}
   memref.store %0, %buffer[%idx1] : memref<?xf32>
-  // CHECK: return %[[VALUE]] : f32
-  return %0 : f32
+  // CHECK: util.return %[[VALUE]] : f32
+  util.return %0 : f32
 }
 
 // -----
@@ -108,21 +108,21 @@
 
 // CHECK-LABEL: @constant_global_f32
 // CHECK-SAME: (%[[IDX:.+]]: index) -> f32 {
-func.func @constant_global_f32(%idx: index) -> f32 {
+util.func @constant_global_f32(%idx: index) -> f32 {
   // CHECK: %[[BUFFER:.+]] = util.global.load @__constant_f32 : !util.buffer
   %0 = memref.get_global @__constant_f32 : memref<2xf32>
   // CHECK: %[[BUFFER_SIZE:.+]] = util.buffer.size %[[BUFFER]]
   // CHECK: %[[IDX_BYTES:.+]] = arith.muli %[[IDX]], %c4
   // CHECK: %[[VALUE:.+]] = util.buffer.load %[[BUFFER]][%[[IDX_BYTES]] for %c4] : !util.buffer{%[[BUFFER_SIZE]]} -> f32
   %1 = memref.load %0[%idx] : memref<2xf32>
-  // CHECK: return %[[VALUE]] : f32
-  return %1 : f32
+  // CHECK: util.return %[[VALUE]] : f32
+  util.return %1 : f32
 }
 
 // -----
 // CHECK-LABEL: @load_store_i16
 // CHECK-SAME: (%[[BUFFER:.+]]: !util.buffer, %[[IDX0:.+]]: index, %[[IDX1:.+]]: index, %[[VALUE:.+]]: i32) -> i32 {
-func.func @load_store_i16(%buffer: memref<?xi16>, %idx0: index, %idx1: index, %value: i16) -> i16 {
+util.func @load_store_i16(%buffer: memref<?xi16>, %idx0: index, %idx1: index, %value: i16) -> i16 {
   // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
   // CHECK-DAG: %[[SZ:.*]] = util.buffer.size %[[BUFFER]]
   // CHECK-DAG: %[[OFS0:.*]] = arith.muli %[[IDX0]], %[[C2]] : index
@@ -133,14 +133,14 @@
   // CHECK: %[[LD:.*]] = util.buffer.load %[[BUFFER]][%[[OFS1]] for %c2] : !util.buffer{%[[SZ]]} -> i16
   // CHECK: %[[UCST1:.*]] = builtin.unrealized_conversion_cast %[[LD]] : i16 to i32
   %1 = memref.load %buffer[%idx1] : memref<?xi16>
-  // CHECK: return %[[UCST1]]
-  return %1 : i16
+  // CHECK: util.return %[[UCST1]]
+  util.return %1 : i16
 }
 
 // -----
 // CHECK-LABEL: @load_store_index
 // CHECK-SAME: (%[[BUFFER:.+]]: !util.buffer, %[[IDX0:.+]]: index, %[[IDX1:.+]]: index, %[[VALUE:.+]]: index) -> index {
-func.func @load_store_index(%buffer: memref<?xindex>, %idx0: index, %idx1: index, %value: index) -> index {
+util.func @load_store_index(%buffer: memref<?xindex>, %idx0: index, %idx1: index, %value: index) -> index {
   // CHECK-DAG: %[[SIZEOF:.*]] = util.sizeof index
   // CHECK-DAG: %[[SZ:.*]] = util.buffer.size %[[BUFFER]]
   // CHECK-DAG: %[[OFS0:.*]] = arith.muli %[[SIZEOF]], %[[IDX0]] : index
@@ -149,30 +149,30 @@
   // CHECK: %[[OFS1:.*]] = arith.muli %[[SIZEOF]], %[[IDX1]] : index
   // CHECK: %[[LD:.*]] = util.buffer.load %[[BUFFER]][%[[OFS1]] for %[[SIZEOF]]] : !util.buffer{%[[SZ]]} -> index
   %1 = memref.load %buffer[%idx1] : memref<?xindex>
-  // CHECK: return %[[LD]]
-  return %1 : index
+  // CHECK: util.return %[[LD]]
+  util.return %1 : index
 }
 
 // -----
 // CHECK-LABEL: @dim_i16
 // CHECK-SAME: (%[[BUFFER:.+]]: !util.buffer, %[[IDX0:.+]]: index) -> index {
-func.func @dim_i16(%buffer: memref<?xi16>, %idx0: index) -> index {
+util.func @dim_i16(%buffer: memref<?xi16>, %idx0: index) -> index {
   // CHECK: %[[C2:.*]] = arith.constant 2 : index
   // CHECK: %[[SZ:.*]] = util.buffer.size %[[BUFFER]] : !util.buffer
   // CHECK: %[[DV:.*]] = arith.floordivsi %[[SZ]], %[[C2]] : index
   %0 = memref.dim %buffer, %idx0 : memref<?xi16>
-  // CHECK: return %[[DV]]
-  return %0 : index
+  // CHECK: util.return %[[DV]]
+  util.return %0 : index
 }
 
 // -----
 // CHECK-LABEL: @dim_index
 // CHECK-SAME: (%[[BUFFER:.+]]: !util.buffer, %[[IDX0:.+]]: index) -> index {
-func.func @dim_index(%buffer: memref<?xindex>, %idx0: index) -> index {
+util.func @dim_index(%buffer: memref<?xindex>, %idx0: index) -> index {
   // CHECK: %[[SIZEOF:.*]] = util.sizeof index
   // CHECK: %[[SZ:.*]] = util.buffer.size %[[BUFFER]] : !util.buffer
   // CHECK: %[[DV:.*]] = arith.floordivsi %[[SZ]], %[[SIZEOF]] : index
   %0 = memref.dim %buffer, %idx0 : memref<?xindex>
-  // CHECK: return %[[DV]]
-  return %0 : index
+  // CHECK: util.return %[[DV]]
+  util.return %0 : index
 }

diff --git a/compiler/src/iree/compiler/Dialect/Util/Conversion/test/structural_ops.mlir b/compiler/src/iree/compiler/Dialect/Util/Conversion/test/structural_ops.mlir
index 357d55f..b815163 100644
--- a/compiler/src/iree/compiler/Dialect/Util/Conversion/test/structural_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/Conversion/test/structural_ops.mlir

@@ -5,69 +5,69 @@
 
 // CHECK: util.initializer
 util.initializer {
-  // CHECK: %[[VALUE:.+]] = func.call @extern
-  %value = func.call @extern() : () -> memref<?xi8>
+  // CHECK: %[[VALUE:.+]] = util.call @extern
+  %value = util.call @extern() : () -> memref<?xi8>
   // CHECK: cf.br ^bb1(%[[VALUE]] : !util.buffer)
   cf.br ^bb1(%value : memref<?xi8>)
 // CHECK: ^bb1(%[[ARG:.+]]: !util.buffer)
 ^bb1(%block_arg: memref<?xi8>):
   util.return
 }
-func.func private @extern() -> memref<?xi8>
+util.func private @extern() -> memref<?xi8>
 
 // -----
 
 // CHECK-LABEL: @funcOp
 // CHECK-SAME: (%[[ARG0:.+]]: !util.buffer) -> !util.buffer
-func.func @funcOp(%arg0: memref<?xi8>) -> memref<?xi8> {
-  // CHECK: return %[[ARG0]] : !util.buffer
-  return %arg0 : memref<?xi8>
+util.func public @funcOp(%arg0: memref<?xi8>) -> memref<?xi8> {
+  // CHECK: util.return %[[ARG0]] : !util.buffer
+  util.return %arg0 : memref<?xi8>
 }
 
 // -----
 
 // CHECK-LABEL: @callOp
 // CHECK-SAME: (%[[ARG0:.+]]: !util.buffer) -> !util.buffer
-func.func @callOp(%arg0: memref<?xi8>) -> memref<?xi8> {
-  // CHECK: %[[RET0:.+]] = call @extern(%[[ARG0]]) : (!util.buffer) -> !util.buffer
-  %ret0 = call @extern(%arg0) : (memref<?xi8>) -> memref<?xi8>
-  // CHECK: return %[[RET0]] : !util.buffer
-  return %ret0 : memref<?xi8>
+util.func public @callOp(%arg0: memref<?xi8>) -> memref<?xi8> {
+  // CHECK: %[[RET0:.+]] = util.call @extern(%[[ARG0]]) : (!util.buffer) -> !util.buffer
+  %ret0 = util.call @extern(%arg0) : (memref<?xi8>) -> memref<?xi8>
+  // CHECK: util.return %[[RET0]] : !util.buffer
+  util.return %ret0 : memref<?xi8>
 }
-// CHECK: func.func private @extern(!util.buffer) -> !util.buffer
-func.func private @extern(memref<?xi8>) -> memref<?xi8>
+// CHECK: util.func private @extern(%arg0: !util.buffer) -> !util.buffer
+util.func private @extern(memref<?xi8>) -> memref<?xi8>
 
 // -----
 
 // CHECK-LABEL: @brOp
 // CHECK-SAME: (%[[ARG0:.+]]: !util.buffer) -> !util.buffer
-func.func @brOp(%arg0: memref<?xi8>) -> memref<?xi8> {
+util.func public @brOp(%arg0: memref<?xi8>) -> memref<?xi8> {
   // CHECK: cf.br ^bb1(%[[ARG0]] : !util.buffer)
   cf.br ^bb1(%arg0 : memref<?xi8>)
 // CHECK: ^bb1(%[[BB1_ARG0:.+]]: !util.buffer):
 ^bb1(%bb1_arg0: memref<?xi8>):
-  // CHECK: return %[[BB1_ARG0]] : !util.buffer
-  return %bb1_arg0 : memref<?xi8>
+  // CHECK: util.return %[[BB1_ARG0]] : !util.buffer
+  util.return %bb1_arg0 : memref<?xi8>
 }
 
 // -----
 
 // CHECK-LABEL: @condBrOp
 // CHECK-SAME: (%[[COND:.+]]: i1, %[[ARG0:.+]]: !util.buffer, %[[ARG1:.+]]: !util.buffer) -> !util.buffer
-func.func @condBrOp(%cond: i1, %arg0: memref<?xi8>, %arg1: memref<?xi8>) -> memref<?xi8> {
+util.func public @condBrOp(%cond: i1, %arg0: memref<?xi8>, %arg1: memref<?xi8>) -> memref<?xi8> {
   // CHECK: cf.cond_br %[[COND]], ^bb1(%[[ARG0]] : !util.buffer), ^bb1(%[[ARG1]] : !util.buffer)
   cf.cond_br %cond, ^bb1(%arg0 : memref<?xi8>), ^bb1(%arg1 : memref<?xi8>)
 // CHECK: ^bb1(%[[BB1_ARG0:.+]]: !util.buffer):
 ^bb1(%bb1_arg0 : memref<?xi8>):
-  // CHECK: return %[[BB1_ARG0]] : !util.buffer
-  return %bb1_arg0 : memref<?xi8>
+  // CHECK: util.return %[[BB1_ARG0]] : !util.buffer
+  util.return %bb1_arg0 : memref<?xi8>
 }
 
 // -----
 
 // CHECK-LABEL: @switchOp
 // CHECK-SAME: (%[[FLAG:.+]]: i32, %[[ARG0:.+]]: !util.buffer, %[[ARG1:.+]]: !util.buffer) -> !util.buffer
-func.func @switchOp(%flag: i32, %arg0: memref<?xi8>, %arg1: memref<?xi8>) -> memref<?xi8> {
+util.func public @switchOp(%flag: i32, %arg0: memref<?xi8>, %arg1: memref<?xi8>) -> memref<?xi8> {
   // CHECK: cf.switch %[[FLAG]] : i32, [
   // CHECK:   default: ^bb1(%[[ARG0]] : !util.buffer),
   // CHECK:   0: ^bb1(%[[ARG1]] : !util.buffer)
@@ -78,26 +78,26 @@
   ]
 // CHECK: ^bb1(%[[BB1_ARG0:.+]]: !util.buffer):
 ^bb1(%bb1_arg0 : memref<?xi8>):
-  // CHECK: return %[[BB1_ARG0]] : !util.buffer
-  return %bb1_arg0 : memref<?xi8>
+  // CHECK: util.return %[[BB1_ARG0]] : !util.buffer
+  util.return %bb1_arg0 : memref<?xi8>
 }
 
 // -----
 
 // CHECK-LABEL: @selectOp
 // CHECK-SAME: (%[[COND:.+]]: i1, %[[ARG0:.+]]: !util.buffer, %[[ARG1:.+]]: !util.buffer) -> !util.buffer
-func.func @selectOp(%cond: i1, %arg0: memref<?xi8>, %arg1: memref<?xi8>) -> memref<?xi8> {
+util.func public @selectOp(%cond: i1, %arg0: memref<?xi8>, %arg1: memref<?xi8>) -> memref<?xi8> {
   // CHECK: %[[RET0:.+]] = arith.select %[[COND]], %[[ARG0]], %[[ARG1]] : !util.buffer
   %ret0 = arith.select %cond, %arg0, %arg1 : memref<?xi8>
-  // CHECK: return %[[RET0]] : !util.buffer
-  return %ret0 : memref<?xi8>
+  // CHECK: util.return %[[RET0]] : !util.buffer
+  util.return %ret0 : memref<?xi8>
 }
 
 // -----
 
 // CHECK-LABEL: @ifOp
 // CHECK-SAME: (%[[COND:.+]]: i1, %[[ARG0:.+]]: !util.buffer, %[[ARG1:.+]]: !util.buffer) -> !util.buffer
-func.func @ifOp(%cond: i1, %arg0: memref<?xi8>, %arg1: memref<?xi8>) -> memref<?xi8> {
+util.func public @ifOp(%cond: i1, %arg0: memref<?xi8>, %arg1: memref<?xi8>) -> memref<?xi8> {
   // CHECK: %[[RET0:.+]] = scf.if %[[COND]] -> (!util.buffer)
   %ret0 = scf.if %cond -> (memref<?xi8>) {
     // CHECK: scf.yield %[[ARG0]] : !util.buffer
@@ -106,6 +106,6 @@
     // CHECK: scf.yield %[[ARG1]] : !util.buffer
     scf.yield %arg1 : memref<?xi8>
   }
-  // CHECK: return %[[RET0]] : !util.buffer
-  return %ret0 : memref<?xi8>
+  // CHECK: util.return %[[RET0]] : !util.buffer
+  util.return %ret0 : memref<?xi8>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Util/IR/UtilDialect.cpp b/compiler/src/iree/compiler/Dialect/Util/IR/UtilDialect.cpp
index c1900fe..9dc9b18 100644
--- a/compiler/src/iree/compiler/Dialect/Util/IR/UtilDialect.cpp
+++ b/compiler/src/iree/compiler/Dialect/Util/IR/UtilDialect.cpp

@@ -112,9 +112,13 @@
 
 UtilDialect::UtilDialect(MLIRContext *context)
     : Dialect(getDialectNamespace(), context, TypeID::get<UtilDialect>()) {
+  context->loadDialect<arith::ArithDialect>();
+
   addInterfaces<UtilOpAsmInterface, UtilInlinerInterface>();
+
   registerAttributes();
   registerTypes();
+
 #define GET_OP_LIST
   addOperations<
 #include "iree/compiler/Dialect/Util/IR/UtilOps.cpp.inc"

diff --git a/compiler/src/iree/compiler/Dialect/Util/IR/UtilExternalModels.cpp b/compiler/src/iree/compiler/Dialect/Util/IR/UtilExternalModels.cpp
index 5e39aad..691e5d5 100644
--- a/compiler/src/iree/compiler/Dialect/Util/IR/UtilExternalModels.cpp
+++ b/compiler/src/iree/compiler/Dialect/Util/IR/UtilExternalModels.cpp

@@ -18,88 +18,9 @@
 
 namespace {
 
-// Since all details of the interface are provided via default implementations,
-// we can just have one templated external model to apply per op, vs one
-// explicit model per op.
-struct GenericNumericCastExternalModel {
-  template <typename OpTy>
-  struct ExternalModel
-      : public NumericCastOpInterface::ExternalModel<ExternalModel<OpTy>,
-                                                     OpTy> {};
-
-  template <typename OpTy>
-  static void add(MLIRContext *ctx) {
-    OpTy::template attachInterface<ExternalModel<OpTy>>(*ctx);
-  }
-
-  template <typename OpTy1, typename OpTy2, typename... More>
-  static void add(MLIRContext *ctx) {
-    add<OpTy1>(ctx);
-    add<OpTy2, More...>(ctx);
-  }
-};
-
-struct InsertSliceOpTiedOpInterface
-    : public TiedOpInterface::ExternalModel<InsertSliceOpTiedOpInterface,
-                                            tensor::InsertSliceOp> {
-  Value getTiedResult(Operation *op, unsigned resultIndex) const {
-    auto insertSliceOp = cast<tensor::InsertSliceOp>(op);
-    return IREE::Util::TiedOpInterface::findTiedBaseValue(
-        insertSliceOp.getDest());
-  }
-
-  ::std::optional<unsigned>
-  getTiedResultOperandIndex(Operation *op, unsigned resultIndex) const {
-    return {1}; // dest
-  }
-
-  SmallVector<int64_t> getTiedResultOperandIndices(Operation *op) const {
-    return {1}; // dest
-  }
-};
-
-template <typename OpTy>
-struct LinalgOpTiedOpInterface
-    : public TiedOpInterface::ExternalModel<LinalgOpTiedOpInterface<OpTy>,
-                                            OpTy> {
-  Value getTiedResult(Operation *op, unsigned resultIndex) const {
-    auto linalgOp = cast<OpTy>(op);
-    return IREE::Util::TiedOpInterface::findTiedBaseValue(
-        linalgOp.getDpsInits()[resultIndex]);
-  }
-
-  ::std::optional<unsigned>
-  getTiedResultOperandIndex(Operation *op, unsigned resultIndex) const {
-    auto linalgOp = cast<OpTy>(op);
-    return {linalgOp.getDpsInitsMutable()[resultIndex].getOperandNumber()};
-  }
-
-  SmallVector<int64_t> getTiedResultOperandIndices(Operation *op) const {
-    SmallVector<int64_t> result;
-    for (unsigned i = 0; i < op->getNumResults(); ++i)
-      result.push_back(*getTiedResultOperandIndex(op, i));
-    return result;
-  }
-};
-
-/// Helper structure that iterates over all LinalgOps in `OpTys` and registers
-/// the `TiedOpInterface` with each of them.
-template <typename... Ops>
-struct LinalgOpTiedOpInterfaceHelper {
-  static void registerOpInterface(MLIRContext *ctx) {
-    (void)std::initializer_list<int>{
-        0, (Ops::template attachInterface<LinalgOpTiedOpInterface<Ops>>(*ctx),
-            0)...};
-  }
-};
-
 struct GlobalOpInterfaceExternalModel
     : public GlobalOpInterface::ExternalModel<GlobalOpInterfaceExternalModel,
                                               ml_program::GlobalOp> {
-  static void add(MLIRContext *ctx) {
-    ml_program::GlobalOp::attachInterface<GlobalOpInterfaceExternalModel>(*ctx);
-  }
-
   Attribute getGlobalInitialValue(Operation *op) const {
     return cast<ml_program::GlobalOp>(op).getValueAttr();
   }
@@ -157,61 +78,146 @@
   }
 };
 
+// Since all details of the interface are provided via default implementations,
+// we can just have one templated external model to apply per op, vs one
+// explicit model per op.
+struct GenericNumericCastExternalModel {
+  template <typename OpTy>
+  struct ExternalModel
+      : public NumericCastOpInterface::ExternalModel<ExternalModel<OpTy>,
+                                                     OpTy> {};
+
+  template <typename OpTy>
+  static void add(MLIRContext *context) {
+    OpTy::template attachInterface<ExternalModel<OpTy>>(*context);
+  }
+
+  template <typename OpTy1, typename OpTy2, typename... More>
+  static void add(MLIRContext *context) {
+    add<OpTy1>(context);
+    add<OpTy2, More...>(context);
+  }
+};
+
+struct InsertSliceOpTiedOpInterface
+    : public TiedOpInterface::ExternalModel<InsertSliceOpTiedOpInterface,
+                                            tensor::InsertSliceOp> {
+  Value getTiedResult(Operation *op, unsigned resultIndex) const {
+    auto insertSliceOp = cast<tensor::InsertSliceOp>(op);
+    return IREE::Util::TiedOpInterface::findTiedBaseValue(
+        insertSliceOp.getDest());
+  }
+
+  ::std::optional<unsigned>
+  getTiedResultOperandIndex(Operation *op, unsigned resultIndex) const {
+    return {1}; // dest
+  }
+
+  SmallVector<int64_t> getTiedResultOperandIndices(Operation *op) const {
+    return {1}; // dest
+  }
+};
+
+template <typename OpTy>
+struct LinalgOpTiedOpInterface
+    : public TiedOpInterface::ExternalModel<LinalgOpTiedOpInterface<OpTy>,
+                                            OpTy> {
+  Value getTiedResult(Operation *op, unsigned resultIndex) const {
+    auto linalgOp = cast<OpTy>(op);
+    return IREE::Util::TiedOpInterface::findTiedBaseValue(
+        linalgOp.getDpsInits()[resultIndex]);
+  }
+
+  ::std::optional<unsigned>
+  getTiedResultOperandIndex(Operation *op, unsigned resultIndex) const {
+    auto linalgOp = cast<OpTy>(op);
+    return {linalgOp.getDpsInitsMutable()[resultIndex].getOperandNumber()};
+  }
+
+  SmallVector<int64_t> getTiedResultOperandIndices(Operation *op) const {
+    SmallVector<int64_t> result;
+    for (unsigned i = 0; i < op->getNumResults(); ++i)
+      result.push_back(*getTiedResultOperandIndex(op, i));
+    return result;
+  }
+};
+
+/// Helper structure that iterates over all LinalgOps in `OpTys` and registers
+/// the `TiedOpInterface` with each of them.
+template <typename... Ops>
+struct LinalgOpTiedOpInterfaceHelper {
+  static void registerOpInterface(MLIRContext *context) {
+    (void)std::initializer_list<int>{
+        0,
+        (Ops::template attachInterface<LinalgOpTiedOpInterface<Ops>>(*context),
+         0)...};
+  }
+};
+
 } // namespace
 
 void registerUtilExternalModels(DialectRegistry &registry) {
   // Must ensure that any dependent dialects are registered.
-  registry.insert<arith::ArithDialect, linalg::LinalgDialect,
-                  ml_program::MLProgramDialect, tensor::TensorDialect>();
+  registry.insert<arith::ArithDialect>();
+  registry.insert<linalg::LinalgDialect>();
+  registry.insert<ml_program::MLProgramDialect>();
+  registry.insert<tensor::TensorDialect>();
 
-  registry.addExtension(+[](MLIRContext *ctx,
-                            ml_program::MLProgramDialect *dialect) {
-    ml_program::GlobalOp::attachInterface<GlobalOpInterfaceExternalModel>(*ctx);
-  });
+  registry.addExtension(
+      +[](MLIRContext *context, ml_program::MLProgramDialect *dialect) {
+        ml_program::GlobalOp::attachInterface<GlobalOpInterfaceExternalModel>(
+            *context);
+      });
 
-  registry.addExtension(+[](MLIRContext *ctx, arith::ArithDialect *dialect) {
+  registry.addExtension(+[](MLIRContext *context,
+                            arith::ArithDialect *dialect) {
     GenericNumericCastExternalModel::add<
         arith::BitcastOp, arith::ExtFOp, arith::ExtUIOp, arith::ExtSIOp,
         arith::FPToSIOp, arith::FPToUIOp, arith::IndexCastOp, arith::TruncFOp,
-        arith::TruncIOp, arith::SIToFPOp, arith::UIToFPOp>(ctx);
+        arith::TruncIOp, arith::SIToFPOp, arith::UIToFPOp>(context);
   });
 
-  registry.addExtension(+[](MLIRContext *ctx, tensor::TensorDialect *dialect) {
-    tensor::InsertSliceOp::attachInterface<InsertSliceOpTiedOpInterface>(*ctx);
-  });
+  registry.addExtension(
+      +[](MLIRContext *context, tensor::TensorDialect *dialect) {
+        tensor::InsertSliceOp::attachInterface<InsertSliceOpTiedOpInterface>(
+            *context);
+      });
 
-  registry.addExtension(+[](MLIRContext *ctx, linalg::LinalgDialect *dialect) {
-    // Register all Linalg structured ops. `LinalgOp` is an interface and it is
-    // not possible to attach an external interface to an existing interface.
-    // Therefore, attach the `TiedOpInterface` to all ops one-by-one.
-    LinalgOpTiedOpInterfaceHelper<
+  registry.addExtension(
+      +[](MLIRContext *context, linalg::LinalgDialect *dialect) {
+        // Register all Linalg structured ops. `LinalgOp` is an interface and it
+        // is not possible to attach an external interface to an existing
+        // interface. Therefore, attach the `TiedOpInterface` to all ops
+        // one-by-one.
+        LinalgOpTiedOpInterfaceHelper<
 #define GET_OP_LIST
 #include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc"
-        >::registerOpInterface(ctx);
-  });
+            >::registerOpInterface(context);
+      });
 
   // TODO(matthias-springer): Use a helper instead of listing all ops. This is
   // tricky because LinalgExtOps.td includes YieldOp.
-  registry.addExtension(+[](MLIRContext *ctx,
+  registry.addExtension(+[](MLIRContext *context,
                             LinalgExt::IREELinalgExtDialect *dialect) {
     LinalgExt::ScatterOp::attachInterface<
-        LinalgOpTiedOpInterface<LinalgExt::ScatterOp>>(*ctx);
+        LinalgOpTiedOpInterface<LinalgExt::ScatterOp>>(*context);
     LinalgExt::SortOp::attachInterface<
-        LinalgOpTiedOpInterface<LinalgExt::SortOp>>(*ctx);
+        LinalgOpTiedOpInterface<LinalgExt::SortOp>>(*context);
     LinalgExt::FftOp::attachInterface<
-        LinalgOpTiedOpInterface<LinalgExt::FftOp>>(*ctx);
+        LinalgOpTiedOpInterface<LinalgExt::FftOp>>(*context);
     LinalgExt::ScanOp::attachInterface<
-        LinalgOpTiedOpInterface<LinalgExt::ScanOp>>(*ctx);
+        LinalgOpTiedOpInterface<LinalgExt::ScanOp>>(*context);
     LinalgExt::ReverseOp::attachInterface<
-        LinalgOpTiedOpInterface<LinalgExt::ReverseOp>>(*ctx);
+        LinalgOpTiedOpInterface<LinalgExt::ReverseOp>>(*context);
     LinalgExt::TopkOp::attachInterface<
-        LinalgOpTiedOpInterface<LinalgExt::TopkOp>>(*ctx);
+        LinalgOpTiedOpInterface<LinalgExt::TopkOp>>(*context);
     LinalgExt::WinogradInputTransformOp::attachInterface<
-        LinalgOpTiedOpInterface<LinalgExt::WinogradInputTransformOp>>(*ctx);
+        LinalgOpTiedOpInterface<LinalgExt::WinogradInputTransformOp>>(*context);
     LinalgExt::WinogradOutputTransformOp::attachInterface<
-        LinalgOpTiedOpInterface<LinalgExt::WinogradOutputTransformOp>>(*ctx);
+        LinalgOpTiedOpInterface<LinalgExt::WinogradOutputTransformOp>>(
+        *context);
     LinalgExt::AttentionOp::attachInterface<
-        LinalgOpTiedOpInterface<LinalgExt::AttentionOp>>(*ctx);
+        LinalgOpTiedOpInterface<LinalgExt::AttentionOp>>(*context);
   });
 }
 

diff --git a/compiler/src/iree/compiler/Dialect/Util/IR/UtilInterfaces.td b/compiler/src/iree/compiler/Dialect/Util/IR/UtilInterfaces.td
index 3b9e0ad..83a6bd8 100644
--- a/compiler/src/iree/compiler/Dialect/Util/IR/UtilInterfaces.td
+++ b/compiler/src/iree/compiler/Dialect/Util/IR/UtilInterfaces.td

@@ -723,6 +723,17 @@
   let methods = [
     InterfaceMethod<
       /*desc=*/[{
+      }],
+      /*retTy=*/"void",
+      /*methodName=*/"getAllTiedOperands",
+      /*args=*/(ins "SmallVectorImpl<int64_t> &":$indices),
+      /*methodBody=*/[{}],
+      /*defaultImplementation=*/[{
+        IREE::Util::detail::getAllTiedOperands($_op, indices);
+      }]
+    >,
+    InterfaceMethod<
+      /*desc=*/[{
         Returns the set of operands that results may be tied to as an
         (index, length) pair ala getODSOperandIndexAndLength.
 
@@ -733,8 +744,9 @@
         of successor operands.
       }],
       /*retTy=*/"std::pair<unsigned, unsigned>",
-      /*methodName=*/"getTiedOperandsIndexAndLength", (ins),
-      /*args=*/[{}],
+      /*methodName=*/"getTiedOperandsIndexAndLength",
+      /*args=*/(ins),
+      /*methodBody=*/[{}],
       /*defaultImplementation=*/[{
         return {0, $_op->getNumOperands()};
       }]
@@ -749,8 +761,9 @@
         ones it will tie.
       }],
       /*retTy=*/"std::pair<unsigned, unsigned>",
-      /*methodName=*/"getTiedResultsIndexAndLength", (ins),
-      /*args=*/[{}],
+      /*methodName=*/"getTiedResultsIndexAndLength",
+      /*args=*/(ins),
+      /*methodBody=*/[{}],
       /*defaultImplementation=*/[{
         return {0, $_op->getNumResults()};
       }]

diff --git a/compiler/src/iree/compiler/Dialect/Util/IR/UtilOps.cpp b/compiler/src/iree/compiler/Dialect/Util/IR/UtilOps.cpp
index 75ea2c3..bc59c87 100644
--- a/compiler/src/iree/compiler/Dialect/Util/IR/UtilOps.cpp
+++ b/compiler/src/iree/compiler/Dialect/Util/IR/UtilOps.cpp

@@ -327,12 +327,16 @@
                                  SmallVectorImpl<Type> &operandTypes) {
   if (failed(parser.parseLParen()))
     return failure();
-  while (!succeeded(parser.parseOptionalRParen())) {
+  if (succeeded(parser.parseOptionalRParen()))
+    return success(); // empty
+  do {
     Type type;
     if (failed(parser.parseType(type)))
       return failure();
     operandTypes.push_back(type);
-  }
+  } while (succeeded(parser.parseOptionalComma()));
+  if (failed(parser.parseRParen()))
+    return failure();
   return success();
 }
 
@@ -1262,8 +1266,9 @@
   OpBuilder builder(location->getContext());
   OperationState state(location, getOperationName());
   FuncOp::build(builder, state, name, type,
-                builder.getIndexArrayAttr(tiedOperands), attrs, argAttrs,
-                resAttrs);
+                tiedOperands.empty() ? ArrayAttr{}
+                                     : builder.getIndexArrayAttr(tiedOperands),
+                attrs, argAttrs, resAttrs);
   return cast<FuncOp>(Operation::create(state));
 }
 
@@ -1275,12 +1280,14 @@
   state.addAttribute(SymbolTable::getSymbolAttrName(),
                      builder.getStringAttr(name));
   state.addAttribute(SymbolTable::getVisibilityAttrName(),
-                     builder.getStringAttr("private"));
+                     builder.getStringAttr("public"));
   state.addAttribute("function_type", TypeAttr::get(type));
   state.attributes.append(attrs.begin(), attrs.end());
   state.attributes.erase(IREE::Util::TiedOpInterface::getStorageAttrName());
-  state.addAttribute(IREE::Util::TiedOpInterface::getStorageAttrName(),
-                     tiedOperands);
+  if (tiedOperands) {
+    state.addAttribute(IREE::Util::TiedOpInterface::getStorageAttrName(),
+                       tiedOperands);
+  }
   state.addRegion();
   if (!argAttrs.empty() || !resAttrs.empty()) {
     assert(type.getNumInputs() == argAttrs.size());
@@ -1417,6 +1424,72 @@
   }
 }
 
+bool IREE::Util::FuncOp::hasAnyTiedOperands() {
+  auto tiedOperandsAttr = getTiedOperandsAttr();
+  if (!tiedOperandsAttr)
+    return false;
+  return llvm::any_of(
+      tiedOperandsAttr.getAsRange<IntegerAttr>(), [](IntegerAttr attr) {
+        return attr.getInt() != IREE::Util::TiedOpInterface::kUntiedIndex;
+      });
+}
+
+void IREE::Util::FuncOp::expandSignature(
+    std::function<void(unsigned, Type, SmallVectorImpl<Type> &)> expandArgument,
+    std::function<void(unsigned, Type, SmallVectorImpl<Type> &)> expandResult) {
+  auto oldType = getFunctionType();
+
+  SmallVector<DictionaryAttr> oldArgumentAttrs;
+  getAllArgAttrs(oldArgumentAttrs);
+  SmallVector<DictionaryAttr> oldResultAttrs;
+  getAllResultAttrs(oldResultAttrs);
+
+  SmallVector<int64_t> adjustedTiedOperands;
+  IREE::Util::detail::getAllTiedOperands(getOperation(), adjustedTiedOperands);
+
+  SmallVector<Type> newArgumentTypes;
+  SmallVector<DictionaryAttr> newArgumentAttrs;
+  for (auto [oldIndex, argType] : llvm::enumerate(oldType.getInputs())) {
+    size_t newIndex = newArgumentTypes.size();
+    expandArgument(oldIndex, argType, newArgumentTypes);
+    size_t expandedCount = newArgumentTypes.size() - newIndex;
+    for (size_t i = 0; i < adjustedTiedOperands.size(); ++i) {
+      if (adjustedTiedOperands[i] == oldIndex)
+        adjustedTiedOperands[i] = newIndex;
+    }
+    newArgumentAttrs.push_back(oldArgumentAttrs[oldIndex]);
+    newArgumentAttrs.append(expandedCount - 1,
+                            DictionaryAttr::get(getContext()));
+  }
+
+  SmallVector<Type> newResultTypes;
+  SmallVector<int64_t> newTiedOperands;
+  SmallVector<DictionaryAttr> newResultAttrs;
+  for (auto [oldIndex, resultType] : llvm::enumerate(oldType.getResults())) {
+    size_t newIndex = newResultTypes.size();
+    expandResult(oldIndex, resultType, newResultTypes);
+    size_t expandedCount = newResultTypes.size() - newIndex;
+    newTiedOperands.push_back(adjustedTiedOperands[oldIndex]);
+    newTiedOperands.append(expandedCount - 1,
+                           IREE::Util::TiedOpInterface::kUntiedIndex);
+    newResultAttrs.push_back(oldResultAttrs[oldIndex]);
+    newResultAttrs.append(expandedCount - 1, DictionaryAttr::get(getContext()));
+  }
+
+  auto newType =
+      FunctionType::get(getContext(), newArgumentTypes, newResultTypes);
+  if (newType != oldType) {
+    setFunctionType(newType);
+    setTiedOperandsAttr(ArrayAttr::get(
+        getContext(),
+        llvm::map_to_vector<8>(newTiedOperands, [&](int64_t v) -> Attribute {
+          return IntegerAttr::get(IndexType::get(getContext()), v);
+        })));
+    setAllArgAttrs(newArgumentAttrs);
+    setAllResultAttrs(newResultAttrs);
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // util.call
 //===----------------------------------------------------------------------===//
@@ -1425,6 +1498,24 @@
   return FunctionType::get(getContext(), getOperandTypes(), getResultTypes());
 }
 
+static bool areTiedOperandsEqual(ArrayAttr a, ArrayAttr b) {
+  auto hasAnyTied = [](ArrayAttr tiedOperandsAttr) {
+    if (!tiedOperandsAttr)
+      return false;
+    return llvm::any_of(
+        tiedOperandsAttr.getAsRange<IntegerAttr>(), [](IntegerAttr attr) {
+          return attr.getInt() != IREE::Util::TiedOpInterface::kUntiedIndex;
+        });
+  };
+  bool hasAnyTiedA = hasAnyTied(a);
+  bool hasAnyTiedB = hasAnyTied(b);
+  if (hasAnyTiedA != hasAnyTiedB)
+    return false;
+  if (!a || !b)
+    return true;
+  return a == b;
+}
+
 LogicalResult CallOp::verifySymbolUses(SymbolTableCollection &symbolTable) {
   Operation *op = getOperation();
 
@@ -1444,16 +1535,50 @@
   }
 
   // Ensure tied operands are consistent.
-  auto expectedTiedOperands = getTiedOperandsAttr();
+  auto callerTiedOperands = getTiedOperandsAttr();
   auto calleeTiedOperands = calleeOp.getTiedOperandsAttr();
-  if (calleeTiedOperands != expectedTiedOperands) {
-    return emitOpError("function tied operands mismatch; expected ")
-           << expectedTiedOperands << " but callee is " << calleeTiedOperands;
+  if (!areTiedOperandsEqual(calleeTiedOperands, callerTiedOperands)) {
+    return emitOpError("function tied operands mismatch; have ")
+           << callerTiedOperands << " but callee is " << calleeTiedOperands;
   }
 
   return success();
 }
 
+IREE::Util::CallOp IREE::Util::CallOp::cloneAndExpand(
+    std::function<void(unsigned, Value, SmallVectorImpl<Value> &)>
+        expandOperand,
+    std::function<void(unsigned, Type, SmallVectorImpl<Type> &)> expandResult,
+    OpBuilder &builder) {
+  SmallVector<int64_t> adjustedTiedOperands;
+  IREE::Util::detail::getAllTiedOperands(getOperation(), adjustedTiedOperands);
+
+  SmallVector<Value> newOperands;
+  for (auto [oldIndex, operand] : llvm::enumerate(getOperands())) {
+    size_t newIndex = newOperands.size();
+    expandOperand(oldIndex, operand, newOperands);
+    for (size_t i = 0; i < adjustedTiedOperands.size(); ++i) {
+      if (adjustedTiedOperands[i] == oldIndex)
+        adjustedTiedOperands[i] = newIndex;
+    }
+  }
+
+  SmallVector<Type> newResultTypes;
+  SmallVector<int64_t> newTiedOperands;
+  for (auto [oldIndex, resultType] : llvm::enumerate(getResultTypes())) {
+    size_t newIndex = newResultTypes.size();
+    expandResult(oldIndex, resultType, newResultTypes);
+    size_t expandedCount = newResultTypes.size() - newIndex;
+    newTiedOperands.push_back(adjustedTiedOperands[oldIndex]);
+    newTiedOperands.append(expandedCount - 1,
+                           IREE::Util::TiedOpInterface::kUntiedIndex);
+  }
+
+  return builder.create<IREE::Util::CallOp>(
+      getLoc(), newResultTypes, getCallee(), newOperands,
+      builder.getIndexArrayAttr(newTiedOperands));
+}
+
 //===----------------------------------------------------------------------===//
 // util.return
 //===----------------------------------------------------------------------===//

diff --git a/compiler/src/iree/compiler/Dialect/Util/IR/UtilOps.td b/compiler/src/iree/compiler/Dialect/Util/IR/UtilOps.td
index 823558d..bf9f7a8 100644
--- a/compiler/src/iree/compiler/Dialect/Util/IR/UtilOps.td
+++ b/compiler/src/iree/compiler/Dialect/Util/IR/UtilOps.td

@@ -634,6 +634,22 @@
 
     ArrayRef<Type> getArgumentTypes() { return getFunctionType().getInputs(); }
     ArrayRef<Type> getResultTypes() { return getFunctionType().getResults(); }
+
+    // Returns true if any operand is tied to a result.
+    bool hasAnyTiedOperands();
+
+    // Updates the function signature to potentially expand each argument and
+    // result. Only the signature and the metadata on the function (tied
+    // operands, argument/result attrs, etc) are updated and the body region
+    // remains unchanged.
+    //
+    // Any type that may be tied must remain in the same relative order (expand
+    // by appending types after the base type).
+    //
+    // If |newSignature| is provided
+    void expandSignature(
+        std::function<void(unsigned, Type, SmallVectorImpl<Type> &)> expandArgument,
+        std::function<void(unsigned, Type, SmallVectorImpl<Type> &)> expandResult);
   }];
 
   let hasCustomAssemblyFormat = 1;
@@ -671,6 +687,19 @@
     Variadic<AnyType>:$results
   );
 
+  let builders = [
+    OpBuilder<(ins
+      CArg<"FunctionOpInterface">:$callee,
+      CArg<"ValueRange">:$operands,
+      CArg<"ArrayAttr", "{}">:$tied_operands,
+      CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs
+    ), [{
+      build($_builder, $_state, callee.getResultTypes(), callee.getName(),
+            operands, tied_operands);
+      $_state.addAttributes(attrs);
+    }]>,
+  ];
+
   let assemblyFormat = [{
     $callee `(` $operands `)`
     attr-dict `:`
@@ -696,6 +725,15 @@
     void setCalleeFromCallable(CallInterfaceCallable callee) {
       (*this)->setAttr("callee", callee.get<SymbolRefAttr>());
     }
+
+    // Clones the call and potentially expands each operand and result.
+    // Callers can then replace result uses using the returned op.
+    // Any type that may be tied must remain in the same relative order (expand
+    // by appending types after the base type).
+    IREE::Util::CallOp cloneAndExpand(
+        std::function<void(unsigned, Value, SmallVectorImpl<Value> &)> expandOperand,
+        std::function<void(unsigned, Type, SmallVectorImpl<Type> &)> expandResult,
+        OpBuilder &builder);
   }];
 }
 

diff --git a/compiler/src/iree/compiler/Dialect/Util/IR/UtilTypes.cpp b/compiler/src/iree/compiler/Dialect/Util/IR/UtilTypes.cpp
index d9c4a92..3d0d28e 100644
--- a/compiler/src/iree/compiler/Dialect/Util/IR/UtilTypes.cpp
+++ b/compiler/src/iree/compiler/Dialect/Util/IR/UtilTypes.cpp

@@ -228,6 +228,17 @@
   return false;
 }
 
+bool isPublicOrExternal(CallableOpInterface callableOp) {
+  if (auto symbolOp = dyn_cast<SymbolOpInterface>(callableOp.getOperation())) {
+    if (symbolOp.isPublic())
+      return true;
+  }
+  auto *region = callableOp.getCallableRegion();
+  if (!region || region->empty())
+    return true;
+  return false;
+}
+
 //===----------------------------------------------------------------------===//
 // Global and structural interface utilities
 //===----------------------------------------------------------------------===//
@@ -350,6 +361,22 @@
 // IREE::Util::TiedOpInterface
 //===----------------------------------------------------------------------===//
 
+void detail::getAllTiedOperands(Operation *op,
+                                SmallVectorImpl<int64_t> &indices) {
+  if (auto tiedOperandsAttr = op->getAttrOfType<ArrayAttr>(
+          IREE::Util::TiedOpInterface::getStorageAttrName())) {
+    for (auto indexAttr : tiedOperandsAttr.getAsRange<IntegerAttr>()) {
+      indices.push_back(indexAttr.getInt());
+    }
+  } else if (auto tiedOp = dyn_cast<IREE::Util::TiedOpInterface>(op)) {
+    indices.assign(op->getNumResults(),
+                   IREE::Util::TiedOpInterface::kUntiedIndex);
+  } else if (auto callableOp = dyn_cast<CallableOpInterface>(op)) {
+    indices.assign(callableOp.getResultTypes().size(),
+                   IREE::Util::TiedOpInterface::kUntiedIndex);
+  }
+}
+
 std::optional<unsigned>
 detail::getTiedResultOperandIndex(Operation *op, unsigned resultIndex) {
   auto storageAttr = op->getAttrOfType<ArrayAttr>(

diff --git a/compiler/src/iree/compiler/Dialect/Util/IR/UtilTypes.h b/compiler/src/iree/compiler/Dialect/Util/IR/UtilTypes.h
index 230c87d..5ac353e 100644
--- a/compiler/src/iree/compiler/Dialect/Util/IR/UtilTypes.h
+++ b/compiler/src/iree/compiler/Dialect/Util/IR/UtilTypes.h

@@ -20,6 +20,7 @@
 #include "mlir/IR/TypeSupport.h"
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/IR/Types.h"
+#include "mlir/Interfaces/CallInterfaces.h"
 
 // clang-format off: must be included after all LLVM/MLIR headers.
 #include "iree/compiler/Dialect/Util/IR/UtilEnums.h.inc" // IWYU pragma: keep
@@ -104,6 +105,11 @@
 // Returns true if the move was successful.
 bool tryMoveProducerBefore(Value value, Operation *consumerOp);
 
+// Returns true if the given callable op is public or external (no body).
+// Such callables cannot have their signature changed without (potentially)
+// breaking linking.
+bool isPublicOrExternal(CallableOpInterface callableOp);
+
 //===----------------------------------------------------------------------===//
 // Global and structural interface utilities
 //===----------------------------------------------------------------------===//
@@ -130,6 +136,7 @@
 
 namespace detail {
 
+void getAllTiedOperands(Operation *op, SmallVectorImpl<int64_t> &indices);
 std::optional<unsigned> getTiedResultOperandIndex(Operation *op,
                                                   unsigned resultIndex);
 void setTiedResultOperandIndex(Operation *op, unsigned resultIndex,

diff --git a/compiler/src/iree/compiler/Dialect/Util/IR/test/alignment_folding.mlir b/compiler/src/iree/compiler/Dialect/Util/IR/test/alignment_folding.mlir
index 3477dbe..522c13c 100644
--- a/compiler/src/iree/compiler/Dialect/Util/IR/test/alignment_folding.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/IR/test/alignment_folding.mlir

@@ -2,65 +2,65 @@
 
 // CHECK-LABEL: @foldSameAlignment
 // CHECK-SAME: (%[[VALUE:.+]]: index, %[[ALIGNMENT:.+]]: index)
-func.func @foldSameAlignment(%value: index, %alignment: index) -> index {
+util.func public @foldSameAlignment(%value: index, %alignment: index) -> index {
   // CHECK: %[[RET:.+]] = util.align %[[VALUE]], %[[ALIGNMENT]]
   %0 = util.align %value, %alignment : index
   // CHECK-NOT: util.align
   %1 = util.align %0, %alignment : index
-  // CHECK: return %[[RET]]
-  return %1 : index
+  // CHECK: util.return %[[RET]]
+  util.return %1 : index
 }
 
 // -----
 
 // CHECK-LABEL: @foldGreaterAlignment
 // CHECK-SAME: (%[[VALUE:.+]]: index)
-func.func @foldGreaterAlignment(%value: index) -> index {
+util.func public @foldGreaterAlignment(%value: index) -> index {
   %c8 = arith.constant 8 : index
   %c16 = arith.constant 16 : index
   // CHECK: %[[RET:.+]] = util.align %[[VALUE]], %c16
   %0 = util.align %value, %c16 : index
   // CHECK-NOT: util.align
   %1 = util.align %0, %c8 : index
-  // CHECK: return %[[RET]]
-  return %1 : index
+  // CHECK: util.return %[[RET]]
+  util.return %1 : index
 }
 
 // -----
 
 // CHECK-LABEL: @dontFoldLesserAlignment
 // CHECK-SAME: (%[[VALUE:.+]]: index)
-func.func @dontFoldLesserAlignment(%value: index) -> index {
+util.func public @dontFoldLesserAlignment(%value: index) -> index {
   %c8 = arith.constant 8 : index
   %c16 = arith.constant 16 : index
   // CHECK: %[[ALIGN16:.+]] = util.align %[[VALUE]], %c8
   %0 = util.align %value, %c8 : index
   // CHECK: %[[ALIGN8:.+]] = util.align %[[ALIGN16]], %c16
   %1 = util.align %0, %c16 : index
-  // CHECK: return %[[ALIGN8]]
-  return %1 : index
+  // CHECK: util.return %[[ALIGN8]]
+  util.return %1 : index
 }
 
 // -----
 
 // CHECK-LABEL: @dontFoldMixedAlignment
 // CHECK-SAME: (%[[VALUE:.+]]: index)
-func.func @dontFoldMixedAlignment(%value: index) -> index {
+util.func public @dontFoldMixedAlignment(%value: index) -> index {
   %c9 = arith.constant 9 : index
   %c16 = arith.constant 16 : index
   // CHECK: %[[ALIGN16:.+]] = util.align %[[VALUE]], %c16
   %0 = util.align %value, %c16 : index
   // CHECK: %[[ALIGN9:.+]] = util.align %[[ALIGN16]], %c9
   %1 = util.align %0, %c9 : index
-  // CHECK: return %[[ALIGN9]]
-  return %1 : index
+  // CHECK: util.return %[[ALIGN9]]
+  util.return %1 : index
 }
 
 // -----
 
 // CHECK-LABEL: @foldAlignmentRecursively
 // CHECK-SAME: (%[[VALUE:.+]]: index, %[[ALIGNMENT:.+]]: index)
-func.func @foldAlignmentRecursively(%value: index, %alignment: index) -> index {
+util.func public @foldAlignmentRecursively(%value: index, %alignment: index) -> index {
   %c16 = arith.constant 16 : index
   // CHECK: %[[ALIGN16:.+]] = util.align %[[VALUE]], %c16
   %0 = util.align %value, %c16 : index
@@ -68,15 +68,15 @@
   %1 = util.align %0, %alignment : index
   // CHECK-NOT: util.align
   %2 = util.align %1, %c16 : index
-  // CHECK: return %[[ALIGN_DYNAMIC]]
-  return %2 : index
+  // CHECK: util.return %[[ALIGN_DYNAMIC]]
+  util.return %2 : index
 }
 
 // -----
 
 // CHECK-LABEL: @foldAddAlignment
 // CHECK-SAME: (%[[LHS:.+]]: index, %[[RHS:.+]]: index, %[[ALIGNMENT:.+]]: index)
-func.func @foldAddAlignment(%lhs: index, %rhs: index, %alignment: index) -> index {
+util.func public @foldAddAlignment(%lhs: index, %rhs: index, %alignment: index) -> index {
   // CHECK: %[[LHS_ALIGNED:.+]] = util.align %[[LHS]], %[[ALIGNMENT]]
   %lhs_aligned = util.align %lhs, %alignment : index
   // CHECK: %[[RHS_ALIGNED:.+]] = util.align %[[RHS]], %[[ALIGNMENT]]
@@ -85,15 +85,15 @@
   %sum_aligned = arith.addi %lhs_aligned, %rhs_aligned : index
   // CHECK-NOT: util.align
   %result = util.align %sum_aligned, %alignment : index
-  // CHECK: return %[[SUM_ALIGNED]]
-  return %result : index
+  // CHECK: util.return %[[SUM_ALIGNED]]
+  util.return %result : index
 }
 
 // -----
 
 // CHECK-LABEL: @foldAddAlignmentConstant
 // CHECK-SAME: (%[[LHS:.+]]: index)
-func.func @foldAddAlignmentConstant(%lhs: index) -> index {
+util.func public @foldAddAlignmentConstant(%lhs: index) -> index {
   %c16 = arith.constant 16 : index
   %c32 = arith.constant 32 : index
   %c64 = arith.constant 64 : index
@@ -103,29 +103,29 @@
   %sum_aligned = arith.addi %lhs_aligned, %c32 : index
   // CHECK-NOT: util.align
   %result = util.align %sum_aligned, %c16 : index
-  // CHECK: return %[[SUM_ALIGNED]]
-  return %result : index
+  // CHECK: util.return %[[SUM_ALIGNED]]
+  util.return %result : index
 }
 
 // -----
 
 // CHECK-LABEL: @foldMulAlignmentConstant
 // CHECK-SAME: (%[[LHS:.+]]: index)
-func.func @foldMulAlignmentConstant(%lhs: index) -> index {
+util.func public @foldMulAlignmentConstant(%lhs: index) -> index {
   %c64 = arith.constant 64 : index
   %c2048 = arith.constant 2048 : index
   // CHECK: %[[RESULT:.+]] = arith.muli %[[LHS]], %c2048
   %lhs_mul = arith.muli %lhs, %c2048 : index
   // CHECK-NOT: util.align
   %result = util.align %lhs_mul, %c64 : index
-  // CHECK: return %[[RESULT]]
-  return %result : index
+  // CHECK: util.return %[[RESULT]]
+  util.return %result : index
 }
 
 // -----
 
 // CHECK-LABEL: @foldConstantAlign
-func.func @foldConstantAlign() -> (index, index, index) {
+util.func public @foldConstantAlign() -> (index, index, index) {
   %c0 = arith.constant 0 : index
   %c7 = arith.constant 7 : index
   %c8 = arith.constant 8 : index
@@ -134,14 +134,14 @@
   %0 = util.align %c0, %c64 : index
   %1 = util.align %c7, %c8 : index
   %2 = util.align %c9, %c8 : index
-  // CHECK: return %c0, %c8, %c16
-  return %0, %1, %2 : index, index, index
+  // CHECK: util.return %c0, %c8, %c16
+  util.return %0, %1, %2 : index, index, index
 }
 
 // -----
 
 // CHECK-LABEL: @foldAffineAlign
-func.func @foldAffineAlign(%arg0: index) -> (index, index) {
+util.func public @foldAffineAlign(%arg0: index) -> (index, index) {
   // CHECK: %[[A0:.+]] = affine.apply affine_map<()[s0] -> (s0 * 16384)>()[%arg0]
   %a0 = affine.apply affine_map<()[s0] -> (s0 * 16384)>()[%arg0]
   %c64 = arith.constant 64 : index
@@ -150,33 +150,33 @@
   %b0 = affine.apply affine_map<()[s0] -> ((s0 * s0) * 4)>()[%arg0]
   %c4 = arith.constant 4 : index
   %b1 = util.align %b0, %c4 : index
-  // CHECK: return %[[A0]], %[[B0]]
-  return %a1, %b1 : index, index
+  // CHECK: util.return %[[A0]], %[[B0]]
+  util.return %a1, %b1 : index, index
 }
 
 // -----
 
 // CHECK-LABEL: @sizeofWholeInt
-func.func @sizeofWholeInt() -> index {
+util.func public @sizeofWholeInt() -> index {
   // CHECK: = arith.constant 4 : index
   %0 = util.sizeof i32
-  return %0 : index
+  util.return %0 : index
 }
 
 // -----
 
 // CHECK-LABEL: @sizeofSubByteInt
-func.func @sizeofSubByteInt() -> index {
+util.func public @sizeofSubByteInt() -> index {
   // CHECK: = arith.constant 2 : index
   %0 = util.sizeof i12
-  return %0 : index
+  util.return %0 : index
 }
 
 // -----
 
 // CHECK-LABEL: @sizeofFloat
-func.func @sizeofFloat() -> index {
+util.func public @sizeofFloat() -> index {
   // CHECK: = arith.constant 4 : index
   %0 = util.sizeof f32
-  return %0 : index
+  util.return %0 : index
 }

diff --git a/compiler/src/iree/compiler/Dialect/Util/IR/test/alignment_ops.mlir b/compiler/src/iree/compiler/Dialect/Util/IR/test/alignment_ops.mlir
index 8bf52f8..d6f45cb 100644
--- a/compiler/src/iree/compiler/Dialect/Util/IR/test/alignment_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/IR/test/alignment_ops.mlir

@@ -1,26 +1,26 @@
 // RUN: iree-opt --split-input-file %s | FileCheck %s
 
 // CHECK-LABEL: @utilAlign
-func.func @utilAlign(%arg0 : index, %arg1: index) {
+util.func public @utilAlign(%arg0 : index, %arg1: index) {
   // CHECK: = util.align %arg0, %arg1 : index
   %result = util.align %arg0, %arg1 : index
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @utilAlignInt
-func.func @utilAlignInt(%arg0 : i32, %arg1: i32) {
+util.func public @utilAlignInt(%arg0 : i32, %arg1: i32) {
   // CHECK: = util.align %arg0, %arg1 : i32
   %result = util.align %arg0, %arg1 : i32
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @sizeofUnfoldable
-func.func @sizeofUnfoldable() -> index {
+util.func public @sizeofUnfoldable() -> index {
   // CHECK: = util.sizeof index
   %0 = util.sizeof index
-  return %0 : index
+  util.return %0 : index
 }

diff --git a/compiler/src/iree/compiler/Dialect/Util/IR/test/assignment_folding.mlir b/compiler/src/iree/compiler/Dialect/Util/IR/test/assignment_folding.mlir
index a541266..d8c0261 100644
--- a/compiler/src/iree/compiler/Dialect/Util/IR/test/assignment_folding.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/IR/test/assignment_folding.mlir

@@ -1,29 +1,29 @@
 // RUN: iree-opt --split-input-file --canonicalize %s | iree-opt --split-input-file | FileCheck %s
 
 // CHECK-LABEL: @foldSwitchI32Nop
-func.func @foldSwitchI32Nop(%arg0 : index) -> i32 {
+util.func public @foldSwitchI32Nop(%arg0 : index) -> i32 {
   // CHECK: %[[DEFAULT:.+]] = arith.constant 5
   %c5 = arith.constant 5 : i32
   %0 = util.switch i32 from [] at %arg0 else %c5 : i32
-  // CHECK: return %[[DEFAULT]] : i32
-  return %0 : i32
+  // CHECK: util.return %[[DEFAULT]] : i32
+  util.return %0 : i32
 }
 
 // -----
 
 // CHECK-LABEL: @foldSwitchI32Identical
-func.func @foldSwitchI32Identical(%arg0 : index) -> i32 {
+util.func public @foldSwitchI32Identical(%arg0 : index) -> i32 {
   // CHECK: %[[C100:.+]] = arith.constant 100
   %c100 = arith.constant 100 : i32
   %0 = util.switch i32 from [%c100, %c100, %c100] at %arg0 else %c100 : i32
-  // CHECK: return %[[C100]] : i32
-  return %0 : i32
+  // CHECK: util.return %[[C100]] : i32
+  util.return %0 : i32
 }
 
 // -----
 
 // CHECK-LABEL: @foldSwitchI32ConstantIndex
-func.func @foldSwitchI32ConstantIndex() -> (i32, i32, i32, i32) {
+util.func public @foldSwitchI32ConstantIndex() -> (i32, i32, i32, i32) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
@@ -40,42 +40,42 @@
   %1 = util.switch i32 from [%c100, %c200, %c300] at %c1 else %c400 : i32
   %2 = util.switch i32 from [%c100, %c200, %c300] at %c2 else %c400 : i32
   %3 = util.switch i32 from [%c100, %c200, %c300] at %c3 else %c400 : i32
-  // CHECK: return %[[C100]], %[[C200]], %[[C300]], %[[C400]] : i32, i32, i32, i32
-  return %0, %1, %2, %3 : i32, i32, i32, i32
+  // CHECK: util.return %[[C100]], %[[C200]], %[[C300]], %[[C400]] : i32, i32, i32, i32
+  util.return %0, %1, %2, %3 : i32, i32, i32, i32
 }
 
 // -----
 
 // CHECK-LABEL: @foldCastSameType
 // CHECK-SAME: (%[[SOURCE:.+]]: !util.buffer)
-func.func @foldCastSameType(%source: !util.buffer) -> !util.buffer {
+util.func public @foldCastSameType(%source: !util.buffer) -> !util.buffer {
   // CHECK-NOT: util.cast
   %0 = util.cast %source : !util.buffer to !util.buffer
-  // CHECK: return %[[SOURCE]]
-  return %0 : !util.buffer
+  // CHECK: util.return %[[SOURCE]]
+  util.return %0 : !util.buffer
 }
 
 // -----
 
 // CHECK-LABEL: @foldChainedCast
 // CHECK-SAME: (%[[SOURCE:.+]]: !util.buffer)
-func.func @foldChainedCast(%source: !util.buffer) -> !util.buffer {
+util.func public @foldChainedCast(%source: !util.buffer) -> !util.buffer {
   // CHECK-NOT: util.cast
   %0 = util.cast %source : !util.buffer to !util.object
   // CHECK-NOT: util.cast
   %1 = util.cast %0 : !util.object to !util.buffer
-  // CHECK: return %[[SOURCE]]
-  return %1 : !util.buffer
+  // CHECK: util.return %[[SOURCE]]
+  util.return %1 : !util.buffer
 }
 
 // -----
 
 // CHECK-LABEL: @foldCastIntoNullOp
-func.func @foldCastIntoNullOp() -> !util.buffer {
+util.func public @foldCastIntoNullOp() -> !util.buffer {
   // CHECK: %[[NULL:.+]] = util.null : !util.buffer
   %0 = util.null : !util.object
   // CHECK-NOT: util.cast
   %1 = util.cast %0 : !util.object to !util.buffer
-  // CHECK: return %[[NULL]]
-  return %1 : !util.buffer
+  // CHECK: util.return %[[NULL]]
+  util.return %1 : !util.buffer
 }

diff --git a/compiler/src/iree/compiler/Dialect/Util/IR/test/assignment_ops.mlir b/compiler/src/iree/compiler/Dialect/Util/IR/test/assignment_ops.mlir
index 3df0fd7..9c420a7 100644
--- a/compiler/src/iree/compiler/Dialect/Util/IR/test/assignment_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/IR/test/assignment_ops.mlir

@@ -2,7 +2,7 @@
 
 // CHECK-LABEL: @switch
 // CHECK-SAME: (%[[INDEX:.+]]: index)
-func.func @switch(%index: index) -> i32 {
+util.func public @switch(%index: index) -> i32 {
   // CHECK-DAG: %[[C100:.+]] = arith.constant 100
   %c100 = arith.constant 100 : i32
   // CHECK-DAG: %[[C200:.+]] = arith.constant 200
@@ -13,15 +13,15 @@
   %default = arith.constant 400 : i32
   // CHECK: = util.switch i32 from [%[[C100]], %[[C200]], %[[C300]]] at %[[INDEX]] else %[[DEFAULT]] : i32
   %0 = util.switch i32 from [%c100, %c200, %c300] at %index else %default : i32
-  return %0 : i32
+  util.return %0 : i32
 }
 
 // -----
 
 // CHECK-LABEL: @cast
 // CHECK-SAME: (%[[SOURCE:.+]]: !util.buffer)
-func.func @cast(%source: !util.buffer) -> !util.object {
+util.func public @cast(%source: !util.buffer) -> !util.object {
   // CHECK: = util.cast %[[SOURCE]] : !util.buffer to !util.object
   %0 = util.cast %source : !util.buffer to !util.object
-  return %0 : !util.object
+  util.return %0 : !util.object
 }

diff --git a/compiler/src/iree/compiler/Dialect/Util/IR/test/buffer_folding.mlir b/compiler/src/iree/compiler/Dialect/Util/IR/test/buffer_folding.mlir
index 811dacb..2ee5bbb 100644
--- a/compiler/src/iree/compiler/Dialect/Util/IR/test/buffer_folding.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/IR/test/buffer_folding.mlir

@@ -1,30 +1,30 @@
 // RUN: iree-opt --split-input-file --canonicalize %s | iree-opt --split-input-file | FileCheck %s
 
 // CHECK-LABEL: @FoldSubspansIntoSliceOp
-func.func @FoldSubspansIntoSliceOp(%arg0: !util.buffer, %arg1: index, %arg2: index, %arg3: index) -> !util.buffer {
+util.func public @FoldSubspansIntoSliceOp(%arg0: !util.buffer, %arg1: index, %arg2: index, %arg3: index) -> !util.buffer {
   %c100 = arith.constant 100 : index
   %c200 = arith.constant 200 : index
   // CHECK: %[[OFFSET:.+]] = arith.addi %arg2, %c100
   %0 = util.buffer.subspan %arg0[%arg2] : !util.buffer{%arg1} -> !util.buffer{%arg3}
   // CHECK: util.buffer.slice %arg0[%[[OFFSET]]] : !util.buffer{%arg1} -> !util.buffer{%c200}
   %1 = util.buffer.slice %0[%c100] : !util.buffer{%arg3} -> !util.buffer{%c200}
-  return %1 : !util.buffer
+  util.return %1 : !util.buffer
 }
 
 // -----
 
 // CHECK-LABEL: @FoldBufferSubspanOp
-func.func @FoldBufferSubspanOp(%arg0: !util.buffer, %arg1: index, %arg2: index) -> !util.buffer {
+util.func public @FoldBufferSubspanOp(%arg0: !util.buffer, %arg1: index, %arg2: index) -> !util.buffer {
   // CHECK-NOT: util.buffer.subspan
   %0 = util.buffer.subspan %arg0[%arg1] : !util.buffer{%arg2} -> !util.buffer{%arg2}
-  // CHECK: return %arg0
-  return %0 : !util.buffer
+  // CHECK: util.return %arg0
+  util.return %0 : !util.buffer
 }
 
 // -----
 
 // CHECK-LABEL: @FoldBufferSubspanOps
-func.func @FoldBufferSubspanOps(%arg0: !util.buffer, %arg1: index) -> !util.buffer {
+util.func public @FoldBufferSubspanOps(%arg0: !util.buffer, %arg1: index) -> !util.buffer {
   %c100 = arith.constant 100 : index
   %c300 = arith.constant 300 : index
   %c400 = arith.constant 400 : index
@@ -33,14 +33,14 @@
   %0 = util.buffer.subspan %arg0[%c100] : !util.buffer{%arg1} -> !util.buffer{%c500}
   %1 = util.buffer.subspan %0[%c100] : !util.buffer{%c500} -> !util.buffer{%c400}
   %2 = util.buffer.subspan %1[%c100] : !util.buffer{%c400} -> !util.buffer{%c300}
-  // CHECK: return %[[RET]]
-  return %2 : !util.buffer
+  // CHECK: util.return %[[RET]]
+  util.return %2 : !util.buffer
 }
 
 // -----
 
 // CHECK-LABEL: @SinkSubspanAcrossSelectOps
-func.func @SinkSubspanAcrossSelectOps(%arg0: !util.buffer, %arg1: i1) -> !util.buffer {
+util.func public @SinkSubspanAcrossSelectOps(%arg0: !util.buffer, %arg1: i1) -> !util.buffer {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   %c256 = arith.constant 256 : index
@@ -51,29 +51,29 @@
   // CHECK: %[[OFFSET:.+]] = arith.select %arg1, %c0, %c128 : index
   %2 = arith.select %arg1, %0, %1 : !util.buffer
   // CHECK-NEXT: %[[SUBSPAN:.+]] = util.buffer.subspan %arg0[%[[OFFSET]]] : !util.buffer{%c256} -> !util.buffer{%c128}
-  // CHECK-NEXT: return %[[SUBSPAN]]
-  return %2 : !util.buffer
+  // CHECK-NEXT: util.return %[[SUBSPAN]]
+  util.return %2 : !util.buffer
 }
 
 // -----
 
 // CHECK-LABEL: @FoldBufferSizeOp
-func.func @FoldBufferSizeOp(%arg0: !util.buffer, %arg1: index) -> (index, i32) {
+util.func public @FoldBufferSizeOp(%arg0: !util.buffer, %arg1: index) -> (index, i32) {
   %c0 = arith.constant 0 : index
   %c4 = arith.constant 4 : index
   // CHECK-NOT: util.buffer.size
   %0 = util.buffer.size %arg0 : !util.buffer
   // CHECK: %[[LOAD:.+]] = util.buffer.load
   %1 = util.buffer.load %arg0[%c0 for %c4] : !util.buffer{%arg1} -> i32
-  // CHECK: return %arg1, %[[LOAD]]
-  return %0, %1 : index, i32
+  // CHECK: util.return %arg1, %[[LOAD]]
+  util.return %0, %1 : index, i32
 }
 
 // -----
 
 // CHECK-LABEL: @FoldNestedBufferSizeOp
 // CHECK-SAME: (%[[BUFFER:.+]]: !util.buffer)
-func.func @FoldNestedBufferSizeOp(%buffer: !util.buffer) {
+util.func public @FoldNestedBufferSizeOp(%buffer: !util.buffer) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c128 = arith.constant 128 : index
@@ -90,25 +90,25 @@
   // CHECK: util.buffer.load %[[BUFFER]]{{.+}} : !util.buffer{%[[BUFFER_SIZE_OUTER]]}
   %outer = util.buffer.load %buffer[%c128 for %c1] : !util.buffer{%buffer_size_outer} -> i8
   util.optimization_barrier %outer : i8
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @FoldConstantBufferSizeOp
-func.func @FoldConstantBufferSizeOp() -> index {
+util.func public @FoldConstantBufferSizeOp() -> index {
   // CHECK-NOT: util.buffer.constant
   %0 = util.buffer.constant : !util.buffer = dense<[1, 2, 3]> : tensor<3xi32>
   // CHECK-NOT: util.buffer.size
   %1 = util.buffer.size %0 : !util.buffer
-  // CHECK: return %c12
-  return %1 : index
+  // CHECK: util.return %c12
+  util.return %1 : index
 }
 
 // -----
 
 // CHECK-LABEL: @SelectBufferSizeOp
-func.func @SelectBufferSizeOp(%arg0: !util.buffer, %arg1: index, %arg2: !util.buffer, %arg3: index, %arg4: i1) -> (!util.buffer, index) {
+util.func public @SelectBufferSizeOp(%arg0: !util.buffer, %arg1: index, %arg2: !util.buffer, %arg3: index, %arg4: i1) -> (!util.buffer, index) {
   %c0 = arith.constant 0 : index
   // CHECK: %[[ARG0_T:.+]] = util.buffer.slice %arg0[%c0] : !util.buffer{%[[ARG0_SZ:.+]]} ->
   %0 = util.buffer.slice %arg0[%c0] : !util.buffer{%arg1} -> !util.buffer{%arg1}
@@ -120,26 +120,26 @@
   %3 = util.buffer.size %2 : !util.buffer
   // CHECK: = util.buffer.slice %[[RET_T]][%c0] : !util.buffer{%[[RET_SIZE]]} ->
   %4 = util.buffer.slice %2[%c0] : !util.buffer{%3} -> !util.buffer{%3}
-  return %4, %3 : !util.buffer, index
+  util.return %4, %3 : !util.buffer, index
 }
 
 // -----
 
 // CHECK-LABEL: @FoldSubspansIntoStorageOp
-func.func @FoldSubspansIntoStorageOp(%arg0: !util.buffer, %arg1: index, %arg2: index, %arg3: index) -> (memref<?xi8>, index) {
+util.func public @FoldSubspansIntoStorageOp(%arg0: !util.buffer, %arg1: index, %arg2: index, %arg3: index) -> (memref<?xi8>, index) {
   // CHECK-NOT: util.buffer.subspan
   %0 = util.buffer.subspan %arg0[%arg2] : !util.buffer{%arg1} -> !util.buffer{%arg3}
   // CHECK: %[[STORAGE:.+]], %[[OFFSET:.+]] = util.buffer.storage %arg0 : !util.buffer{%arg1} -> (memref<?xi8>, index)
   %1:2 = util.buffer.storage %0 : !util.buffer{%arg3} -> (memref<?xi8>, index)
   // CHECK: %[[ADJUSTED_OFFSET:.+]] = arith.addi %arg2, %[[OFFSET]]
-  // CHECK: return %[[STORAGE]], %[[ADJUSTED_OFFSET]]
-  return %1#0, %1#1 : memref<?xi8>, index
+  // CHECK: util.return %[[STORAGE]], %[[ADJUSTED_OFFSET]]
+  util.return %1#0, %1#1 : memref<?xi8>, index
 }
 
 // -----
 
 // CHECK-LABEL: @FoldSubspansIntoCopyOp
-func.func @FoldSubspansIntoCopyOp(%arg0: !util.buffer, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) {
+util.func public @FoldSubspansIntoCopyOp(%arg0: !util.buffer, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) {
   %c1 = arith.constant 1 : index
   %c100 = arith.constant 100 : index
   %c200 = arith.constant 200 : index
@@ -149,13 +149,13 @@
   %1 = util.buffer.subspan %arg0[%arg4] : !util.buffer{%arg1} -> !util.buffer{%arg5}
   // CHECK: util.buffer.copy %arg0[%[[OFFSET_SRC]]], %arg0[%[[OFFSET_DST]]], %c1 : !util.buffer{%arg1} -> !util.buffer{%arg1}
   util.buffer.copy %0[%c100], %1[%c200], %c1 : !util.buffer{%arg3} -> !util.buffer{%arg5}
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @FoldSubspansIntoCompareOp
-func.func @FoldSubspansIntoCompareOp(%arg0: !util.buffer, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> i1 {
+util.func public @FoldSubspansIntoCompareOp(%arg0: !util.buffer, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> i1 {
   %c1 = arith.constant 1 : index
   %c100 = arith.constant 100 : index
   %c200 = arith.constant 200 : index
@@ -165,26 +165,26 @@
   %1 = util.buffer.subspan %arg0[%arg4] : !util.buffer{%arg1} -> !util.buffer{%arg5}
   // CHECK: = util.buffer.compare %arg0[%[[OFFSET_LHS]]], %arg0[%[[OFFSET_RHS]]], %c1 : !util.buffer{%arg1}, !util.buffer{%arg1}
   %2 = util.buffer.compare %0[%c100], %1[%c200], %c1 : !util.buffer{%arg3}, !util.buffer{%arg5}
-  return %2 : i1
+  util.return %2 : i1
 }
 
 // -----
 
 // CHECK-LABEL: @FoldSubspansIntoFillOp
-func.func @FoldSubspansIntoFillOp(%arg0: !util.buffer, %arg1: index, %arg2: i32, %arg3: index, %arg4: index) {
+util.func public @FoldSubspansIntoFillOp(%arg0: !util.buffer, %arg1: index, %arg2: i32, %arg3: index, %arg4: index) {
   %c100 = arith.constant 100 : index
   %c200 = arith.constant 200 : index
   // CHECK: %[[OFFSET:.+]] = arith.addi %arg3, %c100
   %0 = util.buffer.subspan %arg0[%arg3] : !util.buffer{%arg1} -> !util.buffer{%arg4}
   // CHECK: util.buffer.fill %arg2, %arg0[%[[OFFSET]] for %c200] : i32 -> !util.buffer{%arg1}
   util.buffer.fill %arg2, %0[%c100 for %c200] : i32 -> !util.buffer{%arg4}
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @FoldSubspanIntoLoadOp
-func.func @FoldSubspanIntoLoadOp(%arg0: !util.buffer, %arg1: index) -> i32 {
+util.func public @FoldSubspanIntoLoadOp(%arg0: !util.buffer, %arg1: index) -> i32 {
   %c4 = arith.constant 4 : index
   %c64 = arith.constant 64 : index
   %c128 = arith.constant 128 : index
@@ -193,13 +193,13 @@
   %0 = util.buffer.subspan %arg0[%c128] : !util.buffer{%arg1} -> !util.buffer{%c256}
   // CHECK: = util.buffer.load %arg0[%c192 for %c4] : !util.buffer{%arg1} -> i32
   %1 = util.buffer.load %0[%c64 for %c4] : !util.buffer{%c256} -> i32
-  return %1 : i32
+  util.return %1 : i32
 }
 
 // -----
 
 // CHECK-LABEL: @FoldSubspanIntoStoreOp
-func.func @FoldSubspanIntoStoreOp(%arg0: !util.buffer, %arg1: index) {
+util.func public @FoldSubspanIntoStoreOp(%arg0: !util.buffer, %arg1: index) {
   %c4 = arith.constant 4 : index
   %c64 = arith.constant 64 : index
   %c128 = arith.constant 128 : index
@@ -209,5 +209,5 @@
   %0 = util.buffer.subspan %arg0[%c128] : !util.buffer{%arg1} -> !util.buffer{%c256}
   // CHECK: util.buffer.store %c123_i32, %arg0[%c192 for %c4] : i32 -> !util.buffer{%arg1}
   util.buffer.store %c123_i32, %0[%c64 for %c4] : i32 -> !util.buffer{%c256}
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Util/IR/test/buffer_ops.mlir b/compiler/src/iree/compiler/Dialect/Util/IR/test/buffer_ops.mlir
index 736ae48..947e2e0 100644
--- a/compiler/src/iree/compiler/Dialect/Util/IR/test/buffer_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/IR/test/buffer_ops.mlir

@@ -1,139 +1,139 @@
 // RUN: iree-opt --split-input-file %s | iree-opt --split-input-file | FileCheck %s
 
 // CHECK-LABEL: @buffer_constant
-func.func @buffer_constant() -> !util.buffer {
+util.func public @buffer_constant() -> !util.buffer {
   // CHECK: = util.buffer.constant : !util.buffer = dense<[1, 2, 3]> : tensor<3xi32>
   %0 = util.buffer.constant : !util.buffer = dense<[1, 2, 3]> : tensor<3xi32>
-  return %0 : !util.buffer
+  util.return %0 : !util.buffer
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_constant_string
-func.func @buffer_constant_string() -> !util.buffer {
+util.func public @buffer_constant_string() -> !util.buffer {
   // CHECK: = util.buffer.constant : !util.buffer = "hello"
   %0 = util.buffer.constant : !util.buffer = "hello"
-  return %0 : !util.buffer
+  util.return %0 : !util.buffer
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_alloc
-func.func @buffer_alloc(%arg0: index) -> !util.buffer {
+util.func public @buffer_alloc(%arg0: index) -> !util.buffer {
   // CHECK: = util.buffer.alloc uninitialized {alignment = 16 : index} : !util.buffer{%arg0}
   %0 = util.buffer.alloc uninitialized {alignment = 16 : index} : !util.buffer{%arg0}
-  return %0 : !util.buffer
+  util.return %0 : !util.buffer
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_dealloc
-func.func @buffer_dealloc(%arg0: !util.buffer, %arg1: index) {
+util.func public @buffer_dealloc(%arg0: !util.buffer, %arg1: index) {
   // CHECK: util.buffer.dealloc %arg0 : !util.buffer{%arg1}
   util.buffer.dealloc %arg0 : !util.buffer{%arg1}
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_slice
-func.func @buffer_slice(%arg0: !util.buffer, %arg1: index, %arg2: index, %arg3: index) -> !util.buffer {
+util.func public @buffer_slice(%arg0: !util.buffer, %arg1: index, %arg2: index, %arg3: index) -> !util.buffer {
   // CHECK: = util.buffer.slice %arg0[%arg1] : !util.buffer{%arg2} -> !util.buffer{%arg3}
   %0 = util.buffer.slice %arg0[%arg1] : !util.buffer{%arg2} -> !util.buffer{%arg3}
-  return %0 : !util.buffer
+  util.return %0 : !util.buffer
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_subspan
-func.func @buffer_subspan(%arg0: !util.buffer, %arg1: index, %arg2: index, %arg3: index) -> !util.buffer {
+util.func public @buffer_subspan(%arg0: !util.buffer, %arg1: index, %arg2: index, %arg3: index) -> !util.buffer {
   // CHECK: = util.buffer.subspan %arg0[%arg1] : !util.buffer{%arg2} -> !util.buffer{%arg3}
   %0 = util.buffer.subspan %arg0[%arg1] : !util.buffer{%arg2} -> !util.buffer{%arg3}
-  return %0 : !util.buffer
+  util.return %0 : !util.buffer
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_size
-func.func @buffer_size(%arg0: !util.buffer) -> index {
+util.func public @buffer_size(%arg0: !util.buffer) -> index {
   // CHECK: = util.buffer.size %arg0 : !util.buffer
   %0 = util.buffer.size %arg0 : !util.buffer
-  return %0 : index
+  util.return %0 : index
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_storage
-func.func @buffer_storage(%arg0: !util.buffer, %arg1: index) -> (memref<?xi8>, index) {
+util.func public @buffer_storage(%arg0: !util.buffer, %arg1: index) -> (memref<?xi8>, index) {
   // CHECK: = util.buffer.storage %arg0 : !util.buffer{%arg1} -> (memref<?xi8>, index)
   %0, %1 = util.buffer.storage %arg0 : !util.buffer{%arg1} -> (memref<?xi8>, index)
-  return %0, %1 : memref<?xi8>, index
+  util.return %0, %1 : memref<?xi8>, index
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_copy
-func.func @buffer_copy(%arg0: !util.buffer, %arg1: index) {
+util.func public @buffer_copy(%arg0: !util.buffer, %arg1: index) {
   %c1 = arith.constant 1 : index
   %c100 = arith.constant 100 : index
   %c200 = arith.constant 200 : index
   // CHECK: util.buffer.copy %arg0[%c100], %arg0[%c200], %c1 : !util.buffer{%arg1} -> !util.buffer{%arg1}
   util.buffer.copy %arg0[%c100], %arg0[%c200], %c1 : !util.buffer{%arg1} -> !util.buffer{%arg1}
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_compare
-func.func @buffer_compare(%arg0: !util.buffer, %arg1: index) -> i1 {
+util.func public @buffer_compare(%arg0: !util.buffer, %arg1: index) -> i1 {
   %c1 = arith.constant 1 : index
   %c100 = arith.constant 100 : index
   %c200 = arith.constant 200 : index
   // CHECK: = util.buffer.compare %arg0[%c100], %arg0[%c200], %c1 : !util.buffer{%arg1}, !util.buffer{%arg1}
   %0 = util.buffer.compare %arg0[%c100], %arg0[%c200], %c1 : !util.buffer{%arg1}, !util.buffer{%arg1}
-  return %0 : i1
+  util.return %0 : i1
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_fill
-func.func @buffer_fill(%arg0: !util.buffer, %arg1: index, %arg2: i32) {
+util.func public @buffer_fill(%arg0: !util.buffer, %arg1: index, %arg2: i32) {
   %c100 = arith.constant 100 : index
   %c200 = arith.constant 200 : index
   // CHECK: util.buffer.fill %arg2, %arg0[%c100 for %c200] : i32 -> !util.buffer{%arg1}
   util.buffer.fill %arg2, %arg0[%c100 for %c200] : i32 -> !util.buffer{%arg1}
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_load
-func.func @buffer_load(%arg0: !util.buffer, %arg1: index) -> i32 {
+util.func public @buffer_load(%arg0: !util.buffer, %arg1: index) -> i32 {
   %c4 = arith.constant 4 : index
   %c100 = arith.constant 100 : index
   // CHECK: = util.buffer.load %arg0[%c100 for %c4] : !util.buffer{%arg1} -> i32
   %0 = util.buffer.load %arg0[%c100 for %c4] : !util.buffer{%arg1} -> i32
-  return %0 : i32
+  util.return %0 : i32
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_store
-func.func @buffer_store(%arg0: !util.buffer, %arg1: index, %arg2: i32) {
+util.func public @buffer_store(%arg0: !util.buffer, %arg1: index, %arg2: i32) {
   %c4 = arith.constant 4 : index
   %c100 = arith.constant 100 : index
   // CHECK: util.buffer.store %arg2, %arg0[%c100 for %c4] : i32 -> !util.buffer{%arg1}
   util.buffer.store %arg2, %arg0[%c100 for %c4] : i32 -> !util.buffer{%arg1}
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_hash
-func.func @buffer_hash(%arg0: !util.buffer, %arg1: index) -> i64 {
+util.func public @buffer_hash(%arg0: !util.buffer, %arg1: index) -> i64 {
   %c17 = arith.constant 17 : index
   %c100 = arith.constant 100 : index
   // CHECK: = util.buffer.hash %arg0[%c100 for %c17] : !util.buffer{%arg1} -> i64
   %0 = util.buffer.hash %arg0[%c100 for %c17] : !util.buffer{%arg1} -> i64
-  return %0 : i64
+  util.return %0 : i64
 }

diff --git a/compiler/src/iree/compiler/Dialect/Util/IR/test/global_folding.mlir b/compiler/src/iree/compiler/Dialect/Util/IR/test/global_folding.mlir
index fc47404..669a2f5 100644
--- a/compiler/src/iree/compiler/Dialect/Util/IR/test/global_folding.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/IR/test/global_folding.mlir

@@ -13,51 +13,51 @@
 
 util.global private @v_unused : tensor<4xi32>
 // CHECK-LABEL: @unused_load
-func.func @unused_load() {
-  // CHECK-NEXT: return
+util.func public @unused_load() {
+  // CHECK-NEXT: util.return
   %0 = util.global.load @v_unused : tensor<4xi32>
-  return
+  util.return
 }
 
 // -----
 
 util.global private @v_const {inlining_policy = #util.inline.never} = dense<1.0> : tensor<8xf32>
 // CHECK-LABEL: @no_fold_noinline_immutable_const
-func.func @no_fold_noinline_immutable_const() -> tensor<8xf32> {
+util.func public @no_fold_noinline_immutable_const() -> tensor<8xf32> {
   // CHECK-NEXT: = util.global.load @v_const : tensor<8xf32>
   %0 = util.global.load @v_const : tensor<8xf32>
-  return %0 : tensor<8xf32>
+  util.return %0 : tensor<8xf32>
 }
 
 // -----
 
 util.global private mutable @v_nop : tensor<4xi32>
 // CHECK-LABEL: @nop_load_store
-func.func @nop_load_store() {
-  // CHECK-NEXT: return
+util.func public @nop_load_store() {
+  // CHECK-NEXT: util.return
   %0 = util.global.load @v_nop : tensor<4xi32>
   util.global.store %0, @v_nop : tensor<4xi32>
-  return
+  util.return
 }
 
 // -----
 
 util.global private @v : tensor<4xf32>
 // CHECK-LABEL: @fold_load_indirect
-func.func @fold_load_indirect() -> tensor<4xf32> {
+util.func public @fold_load_indirect() -> tensor<4xf32> {
   %0 = util.global.address @v : !util.ptr<tensor<4xf32>>
   // CHECK-NEXT: = util.global.load @v
   %1 = util.global.load.indirect %0 : !util.ptr<tensor<4xf32>> -> tensor<4xf32>
-  return %1 : tensor<4xf32>
+  util.return %1 : tensor<4xf32>
 }
 
 // -----
 
 util.global private mutable @v : tensor<4xf32>
 // CHECK-LABEL: @fold_store_indirect
-func.func @fold_store_indirect(%arg0 : tensor<4xf32>) {
+util.func public @fold_store_indirect(%arg0 : tensor<4xf32>) {
   %0 = util.global.address @v : !util.ptr<tensor<4xf32>>
   // CHECK-NEXT: util.global.store %arg0, @v
   util.global.store.indirect %arg0, %0 : tensor<4xf32> -> !util.ptr<tensor<4xf32>>
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Util/IR/test/global_ops.mlir b/compiler/src/iree/compiler/Dialect/Util/IR/test/global_ops.mlir
index b19b1b0..d5ff734 100644
--- a/compiler/src/iree/compiler/Dialect/Util/IR/test/global_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/IR/test/global_ops.mlir

@@ -43,34 +43,34 @@
 
 util.global private @v_loaded : tensor<4xi32>
 // CHECK-LABEL: @loaded
-func.func @loaded() {
+util.func public @loaded() {
   // CHECK-NEXT: = util.global.load @v_loaded : tensor<4xi32>
   %0 = util.global.load @v_loaded : tensor<4xi32>
-  return
+  util.return
 }
 
 // -----
 
 util.global private mutable @v_stored : tensor<4xi32>
 // CHECK-LABEL: @stored
-func.func @stored() {
+util.func public @stored() {
   // CHECK-NEXT: %[[VAL:.+]] = arith.constant
   %cst = arith.constant dense<5> : tensor<4xi32>
   // CHECK-NEXT: util.global.store %[[VAL]], @v_stored : tensor<4xi32>
   util.global.store %cst, @v_stored : tensor<4xi32>
-  return
+  util.return
 }
 
 // -----
 
 util.global private @v_loaded : tensor<4xf32>
 // CHECK-LABEL: @loaded_indirect
-func.func @loaded_indirect() {
+util.func public @loaded_indirect() {
   // CHECK-NEXT: %[[ADDR:.+]] = util.global.address @v_loaded
   %0 = util.global.address @v_loaded : !util.ptr<tensor<4xf32>>
   // CHECK-NEXT: = util.global.load.indirect %[[ADDR]]
   %1 = util.global.load.indirect %0 : !util.ptr<tensor<4xf32>> -> tensor<4xf32>
-  return
+  util.return
 }
 
 // -----
@@ -78,10 +78,10 @@
 util.global private mutable @v_stored : tensor<4xf32>
 // CHECK-LABEL: @stored_indirect
 // CHECK-SAME: (%[[VALUE:.+]]: tensor<4xf32>)
-func.func @stored_indirect(%arg0: tensor<4xf32>) {
+util.func public @stored_indirect(%arg0: tensor<4xf32>) {
   // CHECK-NEXT: %[[ADDR:.+]] = util.global.address @v_stored
   %0 = util.global.address @v_stored : !util.ptr<tensor<4xf32>>
   // CHECK-NEXT: util.global.store.indirect %[[VALUE]], %[[ADDR]]
   util.global.store.indirect %arg0, %0 : tensor<4xf32> -> !util.ptr<tensor<4xf32>>
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Util/IR/test/hint_folding.mlir b/compiler/src/iree/compiler/Dialect/Util/IR/test/hint_folding.mlir
index e78803f..a1bc0b0 100644
--- a/compiler/src/iree/compiler/Dialect/Util/IR/test/hint_folding.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/IR/test/hint_folding.mlir

@@ -1,63 +1,63 @@
 // RUN: iree-opt --verify-diagnostics --canonicalize --split-input-file %s | FileCheck %s
 
 // CHECK-LABEL: @no_fold_constant
-func.func @no_fold_constant() -> (i32) {
+util.func public @no_fold_constant() -> (i32) {
   // CHECK: constant 1 : i32
   %0 = arith.constant 1 : i32
   // CHECK: util.optimization_barrier
   %1 = "util.optimization_barrier"(%0) : (i32) -> i32
-  return %1 : i32
+  util.return %1 : i32
 }
 
 // -----
 
 // CHECK-LABEL: @no_fold_add
-func.func @no_fold_add() -> (i32) {
+util.func public @no_fold_add() -> (i32) {
   // CHECK-NEXT: %[[C1:.+]] = vm.const.i32 1
   %c1 = vm.const.i32 1
   // CHECK-NEXT: %[[R1:.+]] = util.optimization_barrier %[[C1]]
   %0 = util.optimization_barrier %c1 : i32
   // CHECK-NEXT: %[[R2:.+]] = vm.add.i32 %[[R1]], %[[R1]]
   %1 = vm.add.i32 %0, %0 : i32
-  // CHECK-NEXT: return %[[R2]]
-  return %1 : i32
+  // CHECK-NEXT: util.return %[[R2]]
+  util.return %1 : i32
 }
 
 // -----
 
 // Exists to check that the above succeeds when there's no barrier.
 // CHECK-LABEL: @fold_add
-func.func @fold_add() -> (i32) {
+util.func public @fold_add() -> (i32) {
   // CHECK-NEXT: %[[C2:.+]] = vm.const.i32 2
-  // CHECK-NEXT: return %[[C2]]
+  // CHECK-NEXT: util.return %[[C2]]
   %c1 = vm.const.i32 1
   %0 = vm.add.i32 %c1, %c1 : i32
-  return %0 : i32
+  util.return %0 : i32
 }
 
 // -----
 
-func.func @result_operand_count_mismatch(%arg0 : tensor<i32>, %arg1 : tensor<i32>) {
+util.func public @result_operand_count_mismatch(%arg0 : tensor<i32>, %arg1 : tensor<i32>) {
   // expected-error@+1 {{must have same number of operands and results}}
   %1 = "util.optimization_barrier"(%arg0, %arg1) : (tensor<i32>, tensor<i32>) -> tensor<i32>
-  return
+  util.return
 }
 
 // -----
 
-func.func @result_operand_type_mismatch(%arg0 : tensor<i32>, %arg1 : tensor<i32>) {
+util.func public @result_operand_type_mismatch(%arg0 : tensor<i32>, %arg1 : tensor<i32>) {
   // expected-error@+1 {{must have same operand and result types, but they differ at index 1}}
   %1:2 = "util.optimization_barrier"(%arg0, %arg1) : (tensor<i32>, tensor<i32>) -> (tensor<i32>, memref<i32>)
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @canonicalize_unfoldable_constant
-func.func @canonicalize_unfoldable_constant() -> i32 {
+util.func public @canonicalize_unfoldable_constant() -> i32 {
   // CHECK-NEXT: %[[C:.+]] = arith.constant 42 : i32
   // CHECK-NEXT: %[[R:.+]] = util.optimization_barrier %[[C]] : i32
   %c42 = util.unfoldable_constant 42 : i32
-  // CHECK-NEXT: return %[[R]]
-  return %c42 : i32
+  // CHECK-NEXT: util.return %[[R]]
+  util.return %c42 : i32
 }

diff --git a/compiler/src/iree/compiler/Dialect/Util/IR/test/hint_ops.mlir b/compiler/src/iree/compiler/Dialect/Util/IR/test/hint_ops.mlir
index d8e79c5..8d21da5 100644
--- a/compiler/src/iree/compiler/Dialect/Util/IR/test/hint_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/IR/test/hint_ops.mlir

@@ -3,7 +3,7 @@
 // CHECK-LABEL: @parse_print_barrier
 // CHECK-SAME: %[[ARG0:[a-zA-Z0-9$._-]+]]
 // CHECK-SAME: %[[ARG1:[a-zA-Z0-9$._-]+]]
-func.func @parse_print_barrier(%arg0 : tensor<i32>, %arg1 : tensor<i32>) {
+util.func public @parse_print_barrier(%arg0 : tensor<i32>, %arg1 : tensor<i32>) {
   // CHECK-NEXT: util.optimization_barrier %[[ARG0]] : tensor<i32>
   %1 = util.optimization_barrier %arg0 : tensor<i32>
 
@@ -13,13 +13,13 @@
   // CHECK-NEXT: util.optimization_barrier {some_unit} %[[ARG0]] : tensor<i32>
   %has_attr = util.optimization_barrier {some_unit} %arg0 : tensor<i32>
 
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @parse_print_unfoldable_constant
-func.func @parse_print_unfoldable_constant(%arg0 : tensor<i32>, %arg1 : tensor<i32>) {
+util.func public @parse_print_unfoldable_constant(%arg0 : tensor<i32>, %arg1 : tensor<i32>) {
   // CHECK-NEXT: util.unfoldable_constant 42
   %c42 = util.unfoldable_constant 42 : i32
 
@@ -29,5 +29,5 @@
   // CHECK: util.unfoldable_constant @func_with_args : (f32) -> ()
   %csymref = util.unfoldable_constant @func_with_args : (f32) -> ()
 
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Util/IR/test/list_ops.mlir b/compiler/src/iree/compiler/Dialect/Util/IR/test/list_ops.mlir
index 641e0cd..43eb04a 100644
--- a/compiler/src/iree/compiler/Dialect/Util/IR/test/list_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/IR/test/list_ops.mlir

@@ -1,7 +1,7 @@
 // RUN: iree-opt --split-input-file %s | iree-opt --split-input-file | FileCheck %s
 
 // CHECK-LABEL: @list_init_ops
-func.func @list_init_ops() {
+util.func public @list_init_ops() {
   // CHECK: %[[CAPACITY:.+]] = arith.constant 5
   %capacity = arith.constant 5 : index
   // CHECK: = util.list.create %[[CAPACITY]] : !util.list<?>
@@ -15,14 +15,14 @@
   // CHECK: util.list.resize %[[LIST]], %[[NEW_SIZE]] : !util.list<?>
   util.list.resize %list, %new_size : !util.list<?>
 
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @list_access
 // CHECK-SAME: (%[[LIST:.+]]: !util.list<i32>)
-func.func @list_access(%list: !util.list<i32>) {
+util.func public @list_access(%list: !util.list<i32>) {
   %c10 = arith.constant 10 : index
 
   // CHECK: = util.list.get %[[LIST]][%c10] : !util.list<i32>
@@ -35,14 +35,14 @@
   // CHECK: util.list.set %[[LIST]][%c10], %[[NEW_VALUE]] : !util.list<i32>
   util.list.set %list[%c10], %new_value : !util.list<i32>
 
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @list_access_tensor
 // CHECK-SAME: (%[[LIST:.+]]: !util.list<tensor<*xf32>>)
-func.func @list_access_tensor(%list: !util.list<tensor<*xf32>>) {
+util.func public @list_access_tensor(%list: !util.list<tensor<*xf32>>) {
   %c10 = arith.constant 10 : index
 
   // CHECK: = util.list.get %[[LIST]][%c10] : !util.list<tensor<*xf32>> -> tensor<?xf32>
@@ -53,14 +53,14 @@
   // CHECK: util.list.set %[[LIST]][%c10], %[[NEW_VALUE]] : tensor<5xi32> -> !util.list<tensor<*xf32>>
   util.list.set %list[%c10], %new_value : tensor<5xi32> -> !util.list<tensor<*xf32>>
 
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @list_access_variant
 // CHECK-SAME: (%[[LIST:.+]]: !util.list<?>)
-func.func @list_access_variant(%list: !util.list<?>) {
+util.func public @list_access_variant(%list: !util.list<?>) {
   %c10 = arith.constant 10 : index
   %c11 = arith.constant 11 : index
 
@@ -80,5 +80,5 @@
   // CHECK: util.list.set %[[LIST]][%c11], %[[NEW_TENSOR_VALUE]] : tensor<5xi32> -> !util.list<?>
   util.list.set %list[%c11], %new_tensor_value : tensor<5xi32> -> !util.list<?>
 
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Util/IR/test/numeric_ops.mlir b/compiler/src/iree/compiler/Dialect/Util/IR/test/numeric_ops.mlir
index 3e7ca7a..e366196 100644
--- a/compiler/src/iree/compiler/Dialect/Util/IR/test/numeric_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/IR/test/numeric_ops.mlir

@@ -1,19 +1,19 @@
 // RUN: iree-opt --split-input-file %s | FileCheck %s
 
-func.func @optional_convert_scalar(%arg0 : i32) -> i32 {
+util.func public @optional_convert_scalar(%arg0 : i32) -> i32 {
   // CHECK: util.numeric.optional_narrow %arg0 : i32 as si8
   %0 = util.numeric.optional_narrow %arg0 : i32 as si8
-  return %0 : i32
+  util.return %0 : i32
 }
 
-func.func @optional_convert_tensor(%arg0 : tensor<f32>) -> tensor<f32> {
+util.func public @optional_convert_tensor(%arg0 : tensor<f32>) -> tensor<f32> {
   // CHECK: util.numeric.optional_narrow %arg0 : tensor<f32> as si8
   %0 = util.numeric.optional_narrow %arg0 : tensor<f32> as si8
-  return %0 : tensor<f32>
+  util.return %0 : tensor<f32>
 }
 
-func.func @optional_convert_zero(%arg0 : i32) -> i32 {
+util.func public @optional_convert_zero(%arg0 : i32) -> i32 {
   // CHECK: util.numeric.optional_narrow %arg0 : i32 as ui0
   %0 = util.numeric.optional_narrow %arg0 : i32 as ui0
-  return %0 : i32
+  util.return %0 : i32
 }

diff --git a/compiler/src/iree/compiler/Dialect/Util/IR/test/range_folding.mlir b/compiler/src/iree/compiler/Dialect/Util/IR/test/range_folding.mlir
index 5bc4184..5908d2b 100644
--- a/compiler/src/iree/compiler/Dialect/Util/IR/test/range_folding.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/IR/test/range_folding.mlir

@@ -3,7 +3,7 @@
 // NOTE: util.range.min and util.range.max share their code so we just test min.
 
 // CHECK-LABEL: @rangeMinConstant
-func.func @rangeMinConstant() -> (index, index) {
+util.func public @rangeMinConstant() -> (index, index) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
@@ -12,24 +12,24 @@
   %0 = util.range.min %c0 : index
   // CHECK-DAG: %[[C1:.+]] = arith.constant 1
   %1 = util.range.min %c3, %c1, %c2 : index
-  // CHECK: return %[[C0]], %[[C1]]
-  return %0, %1 : index, index
+  // CHECK: util.return %[[C0]], %[[C1]]
+  util.return %0, %1 : index, index
 }
 
 // -----
 
 // CHECK-LABEL: @rangeMinExpand
-func.func @rangeMinExpand(%arg0: index, %arg1: index) -> index {
+util.func public @rangeMinExpand(%arg0: index, %arg1: index) -> index {
   // CHECK: %[[MIN:.+]] = arith.minui %arg0, %arg1 : index
   %0 = util.range.min %arg0, %arg1 : index
-  // CHECK: return %[[MIN]]
-  return %0 : index
+  // CHECK: util.return %[[MIN]]
+  util.return %0 : index
 }
 
 // -----
 
 // CHECK-LABEL: @rangeMinSimplify
-func.func @rangeMinSimplify(%arg0: index, %arg1: index) -> (index, index) {
+util.func public @rangeMinSimplify(%arg0: index, %arg1: index) -> (index, index) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
@@ -38,26 +38,26 @@
   %0 = util.range.min %arg0, %c0, %arg0, %arg1 : index
   // CHECK: %[[MIN1:.+]] = util.range.min %arg0, %arg1, %c1 : index
   %1 = util.range.min %c3, %arg0, %c1, %arg1, %c2, %arg1 : index
-  // CHECK: return %[[MIN0]], %[[MIN1]]
-  return %0, %1 : index, index
+  // CHECK: util.return %[[MIN0]], %[[MIN1]]
+  util.return %0, %1 : index, index
 }
 
 // -----
 
 // CHECK-LABEL: @rangeExtentsFoldConstants
-func.func @rangeExtentsFoldConstants() -> (index, index) {
+util.func public @rangeExtentsFoldConstants() -> (index, index) {
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
   %c3 = arith.constant 3 : index
   %0:2 = util.range.extents [%c1 for %c2], [%c2 for %c3] : index
-  // CHECK: return %c1, %c4
-  return %0#0, %0#1 : index, index
+  // CHECK: util.return %c1, %c4
+  util.return %0#0, %0#1 : index, index
 }
 
 // -----
 
 // CHECK-LABEL: @rangeExtentsFoldConstantsDynamic
-func.func @rangeExtentsFoldConstantsDynamic(%arg0: index, %arg1: index) -> (index, index) {
+util.func public @rangeExtentsFoldConstantsDynamic(%arg0: index, %arg1: index) -> (index, index) {
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
   %c3 = arith.constant 3 : index
@@ -66,25 +66,25 @@
   // CHECK: %[[RANGE_MIN:.+]] = arith.minui %arg0, %c1
   // CHECK: %[[RANGE_MAX:.+]] = arith.maxui %[[RANGE_MAX_INC]], %c4
   %0:2 = util.range.extents [%c1 for %c2], [%arg0 for %arg1], [%c2 for %c3] : index
-  // CHECK: return %[[RANGE_MIN]], %[[RANGE_MAX]]
-  return %0#0, %0#1 : index, index
+  // CHECK: util.return %[[RANGE_MIN]], %[[RANGE_MAX]]
+  util.return %0#0, %0#1 : index, index
 }
 
 // -----
 
 // CHECK-LABEL: @rangeExtentsExpand1
-func.func @rangeExtentsExpand1(%arg0: index, %arg1: index) -> (index, index) {
+util.func public @rangeExtentsExpand1(%arg0: index, %arg1: index) -> (index, index) {
   // CHECK: %[[RANGE_MAX_EXC:.+]] = arith.addi %arg0, %arg1
   // CHECK: %[[RANGE_MAX_INC:.+]] = arith.subi %[[RANGE_MAX_EXC]], %c1
   %0:2 = util.range.extents [%arg0 for %arg1] : index
-  // CHECK: return %arg0, %[[RANGE_MAX_INC]]
-  return %0#0, %0#1 : index, index
+  // CHECK: util.return %arg0, %[[RANGE_MAX_INC]]
+  util.return %0#0, %0#1 : index, index
 }
 
 // -----
 
 // CHECK-LABEL: @rangeExtentsExpand2
-func.func @rangeExtentsExpand2(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index) {
+util.func public @rangeExtentsExpand2(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index) {
   // CHECK: %[[RANGE_MIN:.+]] = arith.minui %arg0, %arg2
   // CHECK: %[[RANGE0_MAX_EXC:.+]] = arith.addi %arg0, %arg1
   // CHECK: %[[RANGE0_MAX_INC:.+]] = arith.subi %[[RANGE0_MAX_EXC]], %c1
@@ -92,15 +92,15 @@
   // CHECK: %[[RANGE1_MAX_INC:.+]] = arith.subi %[[RANGE1_MAX_EXC]], %c1
   // CHECK: %[[RANGE_MAX:.+]] = arith.maxui %[[RANGE0_MAX_INC]], %[[RANGE1_MAX_INC]]
   %0:2 = util.range.extents [%arg0 for %arg1], [%arg2 for %arg3] : index
-  // CHECK: return %[[RANGE_MIN]], %[[RANGE_MAX]]
-  return %0#0, %0#1 : index, index
+  // CHECK: util.return %[[RANGE_MIN]], %[[RANGE_MAX]]
+  util.return %0#0, %0#1 : index, index
 }
 
 // -----
 
 // CHECK-LABEL: @rangeExtentsDeduplicate
-func.func @rangeExtentsDeduplicate(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index) {
+util.func public @rangeExtentsDeduplicate(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) -> (index, index) {
   // CHECK: = util.range.extents [%arg0 for %arg1], [%arg2 for %arg3], [%arg4 for %arg5] : index
   %0:2 = util.range.extents [%arg0 for %arg1], [%arg2 for %arg3], [%arg0 for %arg1], [%arg4 for %arg5] : index
-  return %0#0, %0#1 : index, index
+  util.return %0#0, %0#1 : index, index
 }

diff --git a/compiler/src/iree/compiler/Dialect/Util/IR/test/range_ops.mlir b/compiler/src/iree/compiler/Dialect/Util/IR/test/range_ops.mlir
index db297c7..c1b8209 100644
--- a/compiler/src/iree/compiler/Dialect/Util/IR/test/range_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/IR/test/range_ops.mlir

@@ -1,32 +1,32 @@
 // RUN: iree-opt --split-input-file %s | iree-opt --split-input-file | FileCheck %s
 
 // CHECK-LABEL: @rangeMin
-func.func @rangeMin(%arg0: index, %arg1: index, %arg2: index) {
+util.func public @rangeMin(%arg0: index, %arg1: index, %arg2: index) {
   // CHECK: = util.range.min %arg0 : index
   %0 = util.range.min %arg0 : index
   // CHECK: = util.range.min %arg0, %arg1, %arg2 : index
   %1 = util.range.min %arg0, %arg1, %arg2 : index
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @rangeMax
-func.func @rangeMax(%arg0: index, %arg1: index, %arg2: index) {
+util.func public @rangeMax(%arg0: index, %arg1: index, %arg2: index) {
   // CHECK: = util.range.max %arg0 : index
   %0 = util.range.max %arg0 : index
   // CHECK: = util.range.max %arg0, %arg1, %arg2 : index
   %1 = util.range.max %arg0, %arg1, %arg2 : index
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @rangeExtents
-func.func @rangeExtents(%arg0: index, %arg1: index, %arg2: index) {
+util.func public @rangeExtents(%arg0: index, %arg1: index, %arg2: index) {
   // CHECK: = util.range.extents [%arg0 for %arg2] : index
   %0:2 = util.range.extents [%arg0 for %arg2] : index
   // CHECK: = util.range.extents [%arg0 for %arg2], [%arg1 for %arg2] : index
   %1:2 = util.range.extents [%arg0 for %arg2], [%arg1 for %arg2] : index
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Util/IR/test/structural_ops.mlir b/compiler/src/iree/compiler/Dialect/Util/IR/test/structural_ops.mlir
index 12f7b6c..8624d84 100644
--- a/compiler/src/iree/compiler/Dialect/Util/IR/test/structural_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/IR/test/structural_ops.mlir

@@ -95,7 +95,7 @@
 util.func @basicCall(%arg0: tensor<?xf32>) -> (tensor<?xf32>, i32) {
   // CHECK: %[[CALL:.+]]:2 = util.call @basicExtern(%[[ARG0]]) : (tensor<?xf32>) -> (tensor<?xf32>, i32)
   %call:2 = util.call @basicExtern(%arg0) : (tensor<?xf32>) -> (tensor<?xf32>, i32)
-  // CHECK: return %[[CALL]]#0, %[[CALL]]#1
+  // CHECK: util.return %[[CALL]]#0, %[[CALL]]#1
   util.return %call#0, %call#1 : tensor<?xf32>, i32
 }
 
@@ -109,7 +109,7 @@
 util.func @inplaceCall(%arg0: tensor<?xf32>) -> tensor<?xf32> {
   // CHECK: %[[CALL:.+]] = util.call @inplaceExtern(%[[ARG0]]) : (tensor<?xf32>) -> %[[ARG0]]
   %call = util.call @inplaceExtern(%arg0) : (tensor<?xf32>) -> %arg0
-  // CHECK: return %[[CALL]]
+  // CHECK: util.return %[[CALL]]
   util.return %call : tensor<?xf32>
 }
 
@@ -123,6 +123,6 @@
 util.func public @inplaceTypeChangeCall(%arg0: tensor<?x4xf32>) -> tensor<4x?xi32> {
   // CHECK: %[[CALL:.+]] = util.call @inplaceTypeChangeExtern(%[[ARG0]]) : (tensor<?x4xf32>) -> %[[ARG0]] as tensor<4x?xi32>
   %call = util.call @inplaceTypeChangeExtern(%arg0) : (tensor<?x4xf32>) -> %arg0 as tensor<4x?xi32>
-  // CHECK: return %[[CALL]]
+  // CHECK: util.return %[[CALL]]
   util.return %call : tensor<4x?xi32>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/IPO.cpp b/compiler/src/iree/compiler/Dialect/Util/Transforms/IPO.cpp
index 8a1123e..5b5300b 100644
--- a/compiler/src/iree/compiler/Dialect/Util/Transforms/IPO.cpp
+++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/IPO.cpp

@@ -18,7 +18,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/AsmState.h"
 #include "mlir/IR/Matchers.h"
 #include "mlir/IR/PatternMatch.h"
@@ -46,9 +45,9 @@
 // callees for example.
 struct FuncAnalysis {
   // Function under analysis.
-  func::FuncOp funcOp;
+  IREE::Util::FuncOp funcOp;
   // All call sites across the whole program.
-  SmallVector<func::CallOp> callOps;
+  SmallVector<IREE::Util::CallOp> callOps;
 
   // Whether this function may be accessed indirectly or used externally.
   // This generally disables optimizations.
@@ -131,13 +130,15 @@
 };
 
 // Note that the analysis results may be incomplete.
-static FuncAnalysis analyzeFuncOp(func::FuncOp funcOp, Explorer &explorer) {
+static FuncAnalysis analyzeFuncOp(IREE::Util::FuncOp funcOp,
+                                  Explorer &explorer) {
   // Gather callers from across the program.
   FuncAnalysis analysis;
   analysis.funcOp = funcOp;
   analysis.isIncomplete = funcOp.isPublic() || funcOp.isExternal();
   if (explorer.walkIncomingCalls(funcOp, [&](mlir::CallOpInterface callOp) {
-        if (auto funcCallOp = dyn_cast<func::CallOp>((Operation *)callOp)) {
+        if (auto funcCallOp =
+                dyn_cast<IREE::Util::CallOp>((Operation *)callOp)) {
           analysis.callOps.push_back(funcCallOp);
         } else {
           analysis.isIncomplete = true;
@@ -147,6 +148,11 @@
     analysis.isIncomplete = true;
   }
 
+  // TODO(benvanik): support functions with tied operands.
+  if (funcOp.hasAnyTiedOperands()) {
+    analysis.isIncomplete = true;
+  }
+
   // Presize data types so we can index them freely below.
   unsigned argCount = funcOp.getNumArguments();
   unsigned resultCount = funcOp.getNumResults();
@@ -168,7 +174,7 @@
 
   // Walk all return sites in the function.
   SmallVector<Value> seenResultValues(resultCount);
-  funcOp.walk([&](func::ReturnOp returnOp) {
+  funcOp.walk([&](IREE::Util::ReturnOp returnOp) {
     for (auto [i, value] : llvm::enumerate(returnOp.getOperands())) {
       // Check to see if the value returned is a constant and stash.
       // We'll only use this value if all return sites are uniform.
@@ -332,7 +338,7 @@
     auto arg = funcOp.getArgument(argIndex);
     bool onlyReturnUsers = true;
     for (auto user : arg.getUsers()) {
-      if (!isa<func::ReturnOp>(user)) {
+      if (!isa<IREE::Util::ReturnOp>(user)) {
         onlyReturnUsers = false;
         break;
       }
@@ -400,7 +406,8 @@
 }
 
 // Returns true if any changes were made.
-static bool applyFuncChanges(FuncAnalysis &analysis, func::FuncOp funcOp) {
+static bool applyFuncChanges(FuncAnalysis &analysis,
+                             IREE::Util::FuncOp funcOp) {
   // Build the new set of function arguments and inline uniform constants.
   auto builder = OpBuilder::atBlockBegin(&funcOp.getBlocks().front());
   auto oldArgTypes = llvm::to_vector(funcOp.getArgumentTypes());
@@ -464,7 +471,7 @@
     return false;
 
   // Erase dead results from all return sites.
-  funcOp.walk([&](func::ReturnOp returnOp) {
+  funcOp.walk([&](IREE::Util::ReturnOp returnOp) {
     for (int i = deadResults.size() - 1; i >= 0; --i) {
       if (deadResults.test(i))
         returnOp.getOperandsMutable().erase(i);
@@ -481,7 +488,8 @@
 }
 
 // Returns true if any changes were made.
-static bool applyCallChanges(FuncAnalysis &analysis, func::CallOp callOp) {
+static bool applyCallChanges(FuncAnalysis &analysis,
+                             IREE::Util::CallOp callOp) {
   // Build the new set of call operands.
   SmallVector<Value> oldOperands = callOp.getOperands();
   SmallVector<Value> newOperands;
@@ -551,8 +559,10 @@
     return false;
 
   // Fully replace call op because we may have changed result count.
-  auto newCallOp = OpBuilder(callOp).create<func::CallOp>(
-      callOp.getLoc(), callOp.getCalleeAttr(), newResultTypes, newOperands);
+  // TODO(benvanik): update tied operands.
+  auto newCallOp = OpBuilder(callOp).create<IREE::Util::CallOp>(
+      callOp.getLoc(), newResultTypes, callOp.getCalleeAttr(), newOperands,
+      /*tied_operands=*/ArrayAttr{});
   newCallOp->setDialectAttrs(callOp->getDialectAttrs());
 
   // Remap live old results -> new results.
@@ -589,7 +599,7 @@
     // across the whole program we can't perform any mutations during this
     // analysis.
     std::vector<FuncAnalysis> analysisResults;
-    for (auto funcOp : moduleOp.getOps<func::FuncOp>()) {
+    for (auto funcOp : moduleOp.getOps<IREE::Util::FuncOp>()) {
       analysisResults.push_back(analyzeFuncOp(funcOp, explorer));
     }
 

diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/PropagateSubranges.cpp b/compiler/src/iree/compiler/Dialect/Util/Transforms/PropagateSubranges.cpp
index b5d0a20..f8d1cbf 100644
--- a/compiler/src/iree/compiler/Dialect/Util/Transforms/PropagateSubranges.cpp
+++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/PropagateSubranges.cpp

@@ -16,7 +16,6 @@
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Debug.h"
 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
@@ -54,7 +53,8 @@
 // already exist offset globals as duplicates will get added and we'll need to
 // rely on global fusion to get rid of them. Note that this only expands globals
 // and does not yet update use sites - we just need the ops to reference.
-static ExpandedGlobalMap expandResourceGlobals(Operation *rootOp) {
+static ExpandedGlobalMap expandResourceGlobals(Operation *rootOp,
+                                               SymbolTable &symbolTable) {
   ExpandedGlobalMap expandedGlobals;
 
   // Gather all of the resource globals in the root.
@@ -67,7 +67,6 @@
   }
 
   // Expand each global by adding the offset right next to it.
-  SymbolTable symbolTable(rootOp);
   auto indexType = IndexType::get(rootOp->getContext());
   for (auto &it : expandedGlobals) {
     auto &global = it.second;
@@ -112,21 +111,25 @@
          llvm::any_of(op->getResultTypes(), isResourceType);
 }
 
+static void expandType(Type type, SmallVectorImpl<Type> &newTypes) {
+  newTypes.push_back(type);
+  if (isResourceType(type)) {
+    auto indexType = IndexType::get(type.getContext());
+    newTypes.push_back(indexType); // resource size
+    newTypes.push_back(indexType); // subrange offset
+    newTypes.push_back(indexType); // subrange length
+  }
+}
+
 // Expands resources in the given |types| list to (resource, size, offset, len).
 // This could be changed to some iterator magic to avoid the alloc.
 static SmallVector<Type> expandTypes(TypeRange types) {
   if (types.empty())
     return {};
-  auto indexType = IndexType::get(types.front().getContext());
   SmallVector<Type> newTypes;
   newTypes.reserve(types.size() * 2);
   for (auto type : types) {
-    newTypes.push_back(type);
-    if (isResourceType(type)) {
-      newTypes.push_back(indexType); // resource size
-      newTypes.push_back(indexType); // subrange offset
-      newTypes.push_back(indexType); // subrange length
-    }
+    expandType(type, newTypes);
   }
   return newTypes;
 }
@@ -178,6 +181,22 @@
   }
 }
 
+static void expandOperand(Location loc, Value operand,
+                          SmallVectorImpl<Value> &newOperands,
+                          SubrangeMap &subrangeMap, IndexSet &indexSet,
+                          OpBuilder &builder) {
+  if (isResourceType(operand.getType())) {
+    auto subrange =
+        consumeSubrange(loc, operand, subrangeMap, indexSet, builder);
+    newOperands.push_back(subrange.resource);
+    newOperands.push_back(subrange.resourceSize);
+    newOperands.push_back(subrange.subrangeOffset);
+    newOperands.push_back(subrange.subrangeLength);
+  } else {
+    newOperands.push_back(operand);
+  }
+}
+
 // Expands resources in |operands| into (resource, size, offset, length) tuples.
 static SmallVector<Value> expandOperands(Location loc, ValueRange operands,
                                          SubrangeMap &subrangeMap,
@@ -186,29 +205,21 @@
   SmallVector<Value> result;
   result.reserve(operands.size() * 2);
   for (auto operand : operands) {
-    if (isResourceType(operand.getType())) {
-      auto subrange =
-          consumeSubrange(loc, operand, subrangeMap, indexSet, builder);
-      result.push_back(subrange.resource);
-      result.push_back(subrange.resourceSize);
-      result.push_back(subrange.subrangeOffset);
-      result.push_back(subrange.subrangeLength);
-    } else {
-      result.push_back(operand);
-    }
+    expandOperand(loc, operand, result, subrangeMap, indexSet, builder);
   }
   return result;
 }
 
-static void expandSubranges(Operation *op, ExpandedGlobalMap &globalMap,
-                            IndexSet &indexSet, SubrangeMap &subrangeMap);
+static void expandSubranges(Operation *op, SymbolTable &symbolTable,
+                            ExpandedGlobalMap &globalMap, IndexSet &indexSet,
+                            SubrangeMap &subrangeMap);
 
 // Recursively expands resources into (resource, size, offset, length) tuples
 // within the given |region|. All branches, ops, and nested regions will be
 // processed.
 static void expandRegion(Region &region, bool canModifyEntryBlock,
-                         ExpandedGlobalMap &globalMap, IndexSet &indexSet,
-                         SubrangeMap subrangeMap) {
+                         SymbolTable &symbolTable, ExpandedGlobalMap &globalMap,
+                         IndexSet &indexSet, SubrangeMap subrangeMap) {
   if (region.empty())
     return;
 
@@ -255,14 +266,14 @@
   if (region.hasOneBlock()) {
     for (auto &op :
          llvm::make_early_inc_range(region.front().getOperations())) {
-      expandSubranges(&op, globalMap, indexSet, subrangeMap);
+      expandSubranges(&op, symbolTable, globalMap, indexSet, subrangeMap);
     }
   } else {
     DominanceInfo domInfo(region.getParentOp());
     for (auto *blockInfo : llvm::breadth_first(domInfo.getRootNode(&region))) {
       auto *block = blockInfo->getBlock();
       for (auto &op : llvm::make_early_inc_range(block->getOperations())) {
-        expandSubranges(&op, globalMap, indexSet, subrangeMap);
+        expandSubranges(&op, symbolTable, globalMap, indexSet, subrangeMap);
       }
     }
   }
@@ -270,10 +281,12 @@
 
 // Recursively expands all regions on the op.
 static void expandRegions(Operation *op, bool canModifyEntryBlock,
+                          SymbolTable &symbolTable,
                           ExpandedGlobalMap &globalMap, IndexSet &indexSet,
                           SubrangeMap subrangeMap) {
   for (auto &region : op->getRegions()) {
-    expandRegion(region, canModifyEntryBlock, globalMap, indexSet, subrangeMap);
+    expandRegion(region, canModifyEntryBlock, symbolTable, globalMap, indexSet,
+                 subrangeMap);
   }
 }
 
@@ -385,23 +398,11 @@
 }
 
 static void expandInitializerOp(IREE::Util::InitializerOp op,
+                                SymbolTable &symbolTable,
                                 ExpandedGlobalMap &globalMap,
                                 IndexSet &indexSet, SubrangeMap &subrangeMap) {
-  expandRegion(op.getRegion(), /*canModifyEntryBlock=*/false, globalMap,
-               indexSet, subrangeMap);
-}
-
-// Returns true if |op| is either public and visible to external modules or
-// external and resolved later on. We can't modify their signatures.
-static bool isPublicOrExternal(CallableOpInterface callableOp) {
-  if (auto symbolOp = dyn_cast<SymbolOpInterface>(callableOp.getOperation())) {
-    if (symbolOp.isPublic())
-      return true;
-  }
-  auto *region = callableOp.getCallableRegion();
-  if (!region || region->empty())
-    return true;
-  return false;
+  expandRegion(op.getRegion(), /*canModifyEntryBlock=*/false, symbolTable,
+               globalMap, indexSet, subrangeMap);
 }
 
 // Inserts subranges on resource arguments.
@@ -412,25 +413,26 @@
 // sites don't need a wait.
 //
 // Example:
-//  func.func @foo(%0: !stream.resource)
+//  util.func @foo(%0: !stream.resource)
 //  ->
-//  func.func @foo(%0: !stream.resource, %sz: index, %o: index, %l: index) {
+//  util.func @foo(%0: !stream.resource, %sz: index, %o: index, %l: index) {
 //    %1 = stream.resource.subview %0[%o] : {%sz} -> {%l}
-static void expandFuncOp(mlir::func::FuncOp op, ExpandedGlobalMap &globalMap,
-                         IndexSet &indexSet, SubrangeMap &subrangeMap) {
+static void expandFuncOp(IREE::Util::FuncOp op, SymbolTable &symbolTable,
+                         ExpandedGlobalMap &globalMap, IndexSet &indexSet,
+                         SubrangeMap &subrangeMap) {
   // Ignore public/external function signatures but still convert regions.
-  bool canModifyEntryBlock = !isPublicOrExternal(op);
+  bool canModifyEntryBlock = !IREE::Util::isPublicOrExternal(op);
   if (canModifyEntryBlock) {
-    auto oldType = op.getFunctionType();
-    auto inputTypes = expandTypes(oldType.getInputs());
-    auto resultTypes = expandTypes(oldType.getResults());
-    auto newType = FunctionType::get(op.getContext(), inputTypes, resultTypes);
-    if (newType != oldType) {
-      op.setType(newType);
-    }
+    op.expandSignature(
+        [&](unsigned i, Type type, SmallVectorImpl<Type> &newTypes) {
+          expandType(type, newTypes);
+        },
+        [&](unsigned i, Type type, SmallVectorImpl<Type> &newTypes) {
+          expandType(type, newTypes);
+        });
   }
-  expandRegion(op.getRegion(), canModifyEntryBlock, globalMap, indexSet,
-               subrangeMap);
+  expandRegion(op.getRegion(), canModifyEntryBlock, symbolTable, globalMap,
+               indexSet, subrangeMap);
 }
 
 // Splits resource operands and results into (resource, resourceSize,
@@ -443,28 +445,31 @@
 //
 // Example:
 //  %1 = stream.resource.subview %0[%o] : {%sz} -> {%l}
-//  %r = call @foo(%1)
+//  %r = util.call @foo(%1)
 //  ->
-//  %r, %rsz, %ro, %rl = call @foo(%0, %sz, %o, %l)
+//  %r, %rsz, %ro, %rl = util.call @foo(%0, %sz, %o, %l)
 //  %2 = stream.resource.subview %r[%ro] : {%rsz} -> {%rl}
-static void expandCallOp(mlir::func::CallOp op, IndexSet &indexSet,
-                         SubrangeMap &subrangeMap) {
+static void expandCallOp(IREE::Util::CallOp op, SymbolTable &symbolTable,
+                         IndexSet &indexSet, SubrangeMap &subrangeMap) {
   if (!usesResources(op))
     return;
 
   // Ignore calls to public/external functions.
-  auto calleeOp = SymbolTable::lookupNearestSymbolFrom<CallableOpInterface>(
-      op, op.getCalleeAttr());
-  if (isPublicOrExternal(calleeOp))
+  auto calleeOp = symbolTable.lookup<CallableOpInterface>(op.getCallee());
+  if (IREE::Util::isPublicOrExternal(calleeOp))
     return;
 
   // Build the new call op with expanded operands and results.
   OpBuilder builder(op);
-  auto operands = expandOperands(op.getLoc(), op.getOperands(), subrangeMap,
-                                 indexSet, builder);
-  auto resultTypes = expandTypes(op.getResultTypes());
-  auto newOp = builder.create<mlir::func::CallOp>(op.getLoc(), op.getCallee(),
-                                                  resultTypes, operands);
+  auto newOp = op.cloneAndExpand(
+      [&](unsigned i, Value operand, SmallVectorImpl<Value> &newOperands) {
+        expandOperand(op.getLoc(), operand, newOperands, subrangeMap, indexSet,
+                      builder);
+      },
+      [&](unsigned i, Type type, SmallVectorImpl<Type> &newTypes) {
+        expandType(type, newTypes);
+      },
+      builder);
 
   // Insert subranges on results that we are sinking across the call edge.
   // The hope is that by moving the subranges here we can fold with uses inside
@@ -499,19 +504,19 @@
 //
 // Example:
 //  %1 = stream.resource.subview %0[%o] : {%sz} -> {%l}
-//  return %1
+//  util.return %1
 //  ->
-//  return %0, %sz, %o, %l
-static void expandReturnOp(mlir::func::ReturnOp op, IndexSet &indexSet,
+//  util.return %0, %sz, %o, %l
+static void expandReturnOp(IREE::Util::ReturnOp op, IndexSet &indexSet,
                            SubrangeMap &subrangeMap) {
   if (!usesResources(op))
     return;
-  if (isPublicOrExternal(op->getParentOfType<mlir::func::FuncOp>()))
+  if (IREE::Util::isPublicOrExternal(op->getParentOfType<IREE::Util::FuncOp>()))
     return;
   OpBuilder builder(op);
   auto operands = expandOperands(op.getLoc(), op.getOperands(), subrangeMap,
                                  indexSet, builder);
-  builder.create<mlir::func::ReturnOp>(op.getLoc(), operands);
+  builder.create<IREE::Util::ReturnOp>(op.getLoc(), operands);
   op.erase();
 }
 
@@ -573,8 +578,9 @@
 
 // Recursively expands resources into (resource, size, offset, length) in |op|.
 // TODO(benvanik): make this a type switch.
-static void expandSubranges(Operation *op, ExpandedGlobalMap &globalMap,
-                            IndexSet &indexSet, SubrangeMap &subrangeMap) {
+static void expandSubranges(Operation *op, SymbolTable &symbolTable,
+                            ExpandedGlobalMap &globalMap, IndexSet &indexSet,
+                            SubrangeMap &subrangeMap) {
   if (auto subrangeOp = dyn_cast<IREE::Util::SubrangeOpInterface>(op)) {
     return updateSubrangeOp(subrangeOp, indexSet, subrangeMap);
   }
@@ -584,12 +590,13 @@
   } else if (auto storeOp = dyn_cast<IREE::Util::GlobalStoreOpInterface>(op)) {
     return expandGlobalStoreOp(storeOp, globalMap, indexSet, subrangeMap);
   } else if (auto initializerOp = dyn_cast<IREE::Util::InitializerOp>(op)) {
-    return expandInitializerOp(initializerOp, globalMap, indexSet, subrangeMap);
-  } else if (auto funcOp = dyn_cast<mlir::func::FuncOp>(op)) {
-    return expandFuncOp(funcOp, globalMap, indexSet, subrangeMap);
-  } else if (auto callOp = dyn_cast<mlir::func::CallOp>(op)) {
-    return expandCallOp(callOp, indexSet, subrangeMap);
-  } else if (auto returnOp = dyn_cast<mlir::func::ReturnOp>(op)) {
+    return expandInitializerOp(initializerOp, symbolTable, globalMap, indexSet,
+                               subrangeMap);
+  } else if (auto funcOp = dyn_cast<IREE::Util::FuncOp>(op)) {
+    return expandFuncOp(funcOp, symbolTable, globalMap, indexSet, subrangeMap);
+  } else if (auto callOp = dyn_cast<IREE::Util::CallOp>(op)) {
+    return expandCallOp(callOp, symbolTable, indexSet, subrangeMap);
+  } else if (auto returnOp = dyn_cast<IREE::Util::ReturnOp>(op)) {
     return expandReturnOp(returnOp, indexSet, subrangeMap);
   } else if (auto branchOp = dyn_cast<mlir::cf::BranchOp>(op)) {
     return expandBranchOp(branchOp, indexSet, subrangeMap);
@@ -604,14 +611,14 @@
   // We could add an interface to ops we want to do this to, though, to at least
   // allow dialects to plug in. For now we just need SCF so this is hardcoded.
   if (auto ifOp = dyn_cast<mlir::scf::IfOp>(op)) {
-    return expandRegions(ifOp, /*canModifyEntryBlock=*/false, globalMap,
-                         indexSet, subrangeMap);
+    return expandRegions(ifOp, /*canModifyEntryBlock=*/false, symbolTable,
+                         globalMap, indexSet, subrangeMap);
   } else if (auto forOp = dyn_cast<mlir::scf::ForOp>(op)) {
-    return expandRegions(forOp, /*canModifyEntryBlock=*/false, globalMap,
-                         indexSet, subrangeMap);
+    return expandRegions(forOp, /*canModifyEntryBlock=*/false, symbolTable,
+                         globalMap, indexSet, subrangeMap);
   } else if (auto whileOp = dyn_cast<mlir::scf::WhileOp>(op)) {
-    return expandRegions(whileOp, /*canModifyEntryBlock=*/false, globalMap,
-                         indexSet, subrangeMap);
+    return expandRegions(whileOp, /*canModifyEntryBlock=*/false, symbolTable,
+                         globalMap, indexSet, subrangeMap);
   }
   // TODO(benvanik): also handle scf.yield: today we don't propagate across
   // return values.
@@ -634,16 +641,16 @@
 public:
   void getDependentDialects(DialectRegistry &registry) const override {
     registry.insert<mlir::arith::ArithDialect>();
-    registry.insert<mlir::func::FuncDialect>();
     registry.insert<mlir::scf::SCFDialect>();
     registry.insert<IREE::Util::UtilDialect>();
   }
 
   void runOnOperation() override {
     auto rootOp = getOperation();
+    SymbolTable symbolTable(rootOp);
 
     // Expand all util.global ops holding resources into resource and subrange.
-    auto globalMap = expandResourceGlobals(rootOp);
+    auto globalMap = expandResourceGlobals(rootOp, symbolTable);
 
     // Walk the entire IR tree and expand the globals.
     // We could do this via pattern application but that gets much trickier to
@@ -658,7 +665,8 @@
       IndexSet indexSet(callableOp.getLoc(),
                         OpBuilder::atBlockBegin(&region->front()));
       SubrangeMap subrangeMap;
-      expandSubranges(callableOp, globalMap, indexSet, subrangeMap);
+      expandSubranges(callableOp, symbolTable, globalMap, indexSet,
+                      subrangeMap);
     }
   }
 };

diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/TestConversion.cpp b/compiler/src/iree/compiler/Dialect/Util/Transforms/TestConversion.cpp
index 30f3057..1a39957 100644
--- a/compiler/src/iree/compiler/Dialect/Util/Transforms/TestConversion.cpp
+++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/TestConversion.cpp

@@ -12,7 +12,6 @@
 #include "iree/compiler/Dialect/Util/Transforms/Passes.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/Transforms/Passes.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Math/IR/Math.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Transforms/DialectConversion.h"
@@ -26,9 +25,9 @@
   TestConversionPass() = default;
   TestConversionPass(const TestConversionPass &) {}
   void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<IREE::Util::UtilDialect, func::FuncDialect,
-                    mlir::arith::ArithDialect, math::MathDialect,
-                    mlir::affine::AffineDialect, memref::MemRefDialect>();
+    registry.insert<IREE::Util::UtilDialect, mlir::arith::ArithDialect,
+                    math::MathDialect, mlir::affine::AffineDialect,
+                    memref::MemRefDialect>();
   }
 
   void runOnOperation() override {

diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/combine_initializers.mlir b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/combine_initializers.mlir
index a69a09d..df2ce82 100644
--- a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/combine_initializers.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/combine_initializers.mlir

@@ -2,12 +2,12 @@
 
 // Tests that multiple initializers are combined in their module order.
 
-func.func private @extern() -> index
+util.func private @extern() -> index
 
 // CHECK: util.global private mutable @global0 : index
 util.global private mutable @global0 : index
 util.initializer {
-  %value0 = func.call @extern() : () -> index
+  %value0 = util.call @extern() : () -> index
   util.global.store %value0, @global0 : index
   util.return
 }
@@ -16,28 +16,28 @@
 // CHECK-NEXT: util.global private @global2 : index
 util.global private @global2 : index
 util.initializer {
-  %value1 = func.call @extern() : () -> index
+  %value1 = util.call @extern() : () -> index
   util.global.store %value1, @global1 : index
-  %value2 = func.call @extern() : () -> index
+  %value2 = util.call @extern() : () -> index
   util.global.store %value2, @global2 : index
   util.return
 }
 // CHECK-NEXT: util.initializer {
-// CHECK-NEXT: %[[VALUE0:.+]] = func.call @extern()
+// CHECK-NEXT: %[[VALUE0:.+]] = util.call @extern()
 // CHECK-NEXT: util.global.store %[[VALUE0]], @global0
-// CHECK-NEXT: %[[VALUE1:.+]] = func.call @extern()
+// CHECK-NEXT: %[[VALUE1:.+]] = util.call @extern()
 // CHECK-NEXT: util.global.store %[[VALUE1]], @global1
-// CHECK-NEXT: %[[VALUE2:.+]] = func.call @extern()
+// CHECK-NEXT: %[[VALUE2:.+]] = util.call @extern()
 // CHECK-NEXT: util.global.store %[[VALUE2]], @global2
 // CHECK-NEXT: util.return
 
 // CHECK-LABEL: @orderedCombining
-func.func @orderedCombining(%arg0: index) -> (index, index, index) {
+util.func @orderedCombining(%arg0: index) -> (index, index, index) {
   util.global.store %arg0, @global0 : index
   %value0 = util.global.load @global0 : index
   %value1 = util.global.load @global1 : index
   %value2 = util.global.load @global2 : index
-  return %value0, %value1, %value2 : index, index, index
+  util.return %value0, %value1, %value2 : index, index, index
 }
 
 // -----

diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/demote_f32_to_f16.mlir b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/demote_f32_to_f16.mlir
index 6285d5e..55eee1e 100644
--- a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/demote_f32_to_f16.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/demote_f32_to_f16.mlir

@@ -3,26 +3,26 @@
 // NOTE: for more comprehensive tests see demote_i64_to_i32.mlir.
 
 //       CHECK: util.global {{.*}} : tensor<4xf16>
-// CHECK-LABEL: func.func @simple_f32() -> tensor<4xf16>
+// CHECK-LABEL: util.func public @simple_f32() -> tensor<4xf16>
 //  CHECK-NEXT: %{{.*}} = util.global.address @__global : !util.ptr<tensor<4xf16>>
 //  CHECK-NEXT: %{{.*}} = util.global.load.indirect %{{.*}} : !util.ptr<tensor<4xf16>> -> tensor<4xf16>
-//  CHECK-NEXT: return %{{.*}} : tensor<4xf16>
+//  CHECK-NEXT: util.return %{{.*}} : tensor<4xf16>
 util.global private @"__global" = dense<[1.000000e+01, 5.000000e+00, 1.000000e+01, 5.000000e+00]> : tensor<4xf32>
-func.func @simple_f32() -> (tensor<4xf32>) {
+util.func public @simple_f32() -> (tensor<4xf32>) {
   %0 = util.global.address @"__global" : !util.ptr<tensor<4xf32>>
   %1 = util.global.load.indirect %0 : !util.ptr<tensor<4xf32>> -> tensor<4xf32>
-  return %1 : tensor<4xf32>
+  util.return %1 : tensor<4xf32>
 }
 
 // -----
 
 // CHECK: util.global
 // CHECK-NOT: f32
-// CHECK-LABEL: func.func @nested_region_f32()
+// CHECK-LABEL: util.func public @nested_region_f32()
 // CHECK-NOT: f32
-// CHECK: return %{{.*}} : tensor<?xf16>
+// CHECK: util.return %{{.*}} : tensor<?xf16>
 util.global private @"__global" = dense<[1.000000e+01, 5.000000e+00, 1.000000e+01, 5.000000e+00]> : tensor<4xf32>
-func.func @nested_region_f32() -> (tensor<?xf32>) {
+util.func public @nested_region_f32() -> (tensor<?xf32>) {
   %0 = util.global.address @"__global" : !util.ptr<tensor<4xf32>>
   %1 = util.global.load.indirect %0 : !util.ptr<tensor<4xf32>> -> tensor<4xf32>
   %c4 = arith.constant 4 : index
@@ -31,5 +31,5 @@
     %element = tensor.extract %1[%arg0] : tensor<4xf32>
     tensor.yield %element : f32
   } : tensor<?xf32>
-  return %2 : tensor<?xf32>
+  util.return %2 : tensor<?xf32>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/demote_f64_to_f32.mlir b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/demote_f64_to_f32.mlir
index ceb926b..bd481ce 100644
--- a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/demote_f64_to_f32.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/demote_f64_to_f32.mlir

@@ -2,46 +2,46 @@
 
 // NOTE: for more comprehensive tests see demote_i64_to_i32.mlir.
 
-// CHECK-LABEL: func.func @constantF64
+// CHECK-LABEL: util.func public @constantF64
 // CHECK-SAME: () -> f32
-func.func @constantF64() -> f64 {
+util.func public @constantF64() -> f64 {
   // CHECK-NEXT: constant 123.{{.+}} : f32
   %c1234 = arith.constant 123.4 : f64
-  return %c1234 : f64
+  util.return %c1234 : f64
 }
 
 // -----
 
-// CHECK-LABEL: func.func @tensorTypesF64
+// CHECK-LABEL: util.func public @tensorTypesF64
 // CHECK-SAME: (%arg0: tensor<4x4xf32>) -> tensor<4x4xf32>
-func.func @tensorTypesF64(%arg0 : tensor<4x4xf64>) -> tensor<4x4xf64> {
+util.func public @tensorTypesF64(%arg0 : tensor<4x4xf64>) -> tensor<4x4xf64> {
   // CHECK-NEXT: return %arg0 : tensor<4x4xf32>
-  return %arg0 : tensor<4x4xf64>
+  util.return %arg0 : tensor<4x4xf64>
 }
 
 // -----
 
 //       CHECK: util.global {{.*}} : tensor<4xf32>
-// CHECK-LABEL: func.func @simple_f64() -> tensor<4xf32>
+// CHECK-LABEL: util.func public @simple_f64() -> tensor<4xf32>
 //  CHECK-NEXT: %{{.*}} = util.global.address @__global : !util.ptr<tensor<4xf32>>
 //  CHECK-NEXT: %{{.*}} = util.global.load.indirect %{{.*}} : !util.ptr<tensor<4xf32>> -> tensor<4xf32>
-//  CHECK-NEXT: return %{{.*}} : tensor<4xf32>
+//  CHECK-NEXT: util.return %{{.*}} : tensor<4xf32>
 util.global private @"__global" = dense<[1.000000e+01, 5.000000e+00, 1.000000e+01, 5.000000e+00]> : tensor<4xf64>
-func.func @simple_f64() -> (tensor<4xf64>) {
+util.func public @simple_f64() -> (tensor<4xf64>) {
   %0 = util.global.address @"__global" : !util.ptr<tensor<4xf64>>
   %1 = util.global.load.indirect %0 : !util.ptr<tensor<4xf64>> -> tensor<4xf64>
-  return %1 : tensor<4xf64>
+  util.return %1 : tensor<4xf64>
 }
 
 // -----
 
 // CHECK: util.global
 // CHECK-NOT: f64
-// CHECK-LABEL: func.func @nested_region_f64()
+// CHECK-LABEL: util.func public @nested_region_f64()
 // CHECK-NOT: f64
-// CHECK: return %{{.*}} : tensor<?xf32>
+// CHECK: util.return %{{.*}} : tensor<?xf32>
 util.global private @"__global" = dense<[1.000000e+01, 5.000000e+00, 1.000000e+01, 5.000000e+00]> : tensor<4xf64>
-func.func @nested_region_f64() -> (tensor<?xf64>) {
+util.func public @nested_region_f64() -> (tensor<?xf64>) {
   %0 = util.global.address @"__global" : !util.ptr<tensor<4xf64>>
   %1 = util.global.load.indirect %0 : !util.ptr<tensor<4xf64>> -> tensor<4xf64>
   %c4 = arith.constant 4 : index
@@ -50,35 +50,35 @@
     %element = tensor.extract %1[%arg0] : tensor<4xf64>
     tensor.yield %element : f64
   } : tensor<?xf64>
-  return %2 : tensor<?xf64>
+  util.return %2 : tensor<?xf64>
 }
 
 // -----
 
 // Check handling of width-sensitive arith casts.
 
-// CHECK-LABEL:   func.func @arith.truncf(
-// CHECK-SAME:            %[[ARG0:.*]]: f32) -> f32 {
-// CHECK:           return %[[ARG0]] : f32
-func.func @arith.truncf(%arg0: f64) -> f32 {
+// CHECK-LABEL: util.func public @arith.truncf(
+// CHECK-SAME:      %[[ARG0:.*]]: f32) -> f32 {
+// CHECK:         util.return %[[ARG0]] : f32
+util.func public @arith.truncf(%arg0: f64) -> f32 {
   %0 = arith.truncf %arg0 : f64 to f32
-  return %0 : f32
+  util.return %0 : f32
 }
 
-// CHECK-LABEL:   func.func @arith.extf(
-// CHECK-SAME:            %[[ARG0:.*]]: f32) -> f32 {
-// CHECK:           return %[[ARG0]] : f32
-func.func @arith.extf(%arg0: f32) -> f64 {
+// CHECK-LABEL: util.func public @arith.extf(
+// CHECK-SAME:      %[[ARG0:.*]]: f32) -> f32 {
+// CHECK:         util.return %[[ARG0]] : f32
+util.func public @arith.extf(%arg0: f32) -> f64 {
   %0 = arith.extf %arg0 : f32 to f64
-  return %0 : f64
+  util.return %0 : f64
 }
 
 // -----
 
-// CHECK-LABEL: func.func @complexTypesF64
+// CHECK-LABEL: util.func public @complexTypesF64
 // CHECK-SAME: (%arg0: complex<f32>) -> complex<f32>
-func.func @complexTypesF64(%arg0 : complex<f64>) -> complex<f64> {
-  // CHECK-NEXT: return %arg0 : complex<f32>
-  return %arg0 : complex<f64>
+util.func public @complexTypesF64(%arg0 : complex<f64>) -> complex<f64> {
+  // CHECK-NEXT: util.return %arg0 : complex<f32>
+  util.return %arg0 : complex<f64>
 }
 

diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/demote_i64_to_i32.mlir b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/demote_i64_to_i32.mlir
index 1aa3b54..e05669e 100644
--- a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/demote_i64_to_i32.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/demote_i64_to_i32.mlir

@@ -1,58 +1,58 @@
 // RUN: iree-opt --split-input-file --allow-unregistered-dialect --iree-util-demote-i64-to-i32 %s | FileCheck %s
 
-// CHECK-LABEL: func.func @constant_i64
+// CHECK-LABEL: util.func public @constant_i64
 // CHECK-SAME: () -> i32
-func.func @constant_i64() -> i64 {
+util.func public @constant_i64() -> i64 {
   // CHECK-NEXT: constant 123 : i32
   %c123 = arith.constant 123 : i64
-  return %c123 : i64
+  util.return %c123 : i64
 }
 
 // -----
 
-// CHECK-LABEL: func.func @constant_splat_i64
+// CHECK-LABEL: util.func public @constant_splat_i64
 // CHECK-SAME: () -> tensor<4xi32>
-func.func @constant_splat_i64() -> tensor<4xi64> {
+util.func public @constant_splat_i64() -> tensor<4xi64> {
   // CHECK-NEXT: constant dense<123> : tensor<4xi32>
   %c123 = arith.constant dense<123> : tensor<4xi64>
-  return %c123 : tensor<4xi64>
+  util.return %c123 : tensor<4xi64>
 }
 
 // -----
 
-// CHECK-LABEL: func.func @constant_dense_i64
+// CHECK-LABEL: util.func public @constant_dense_i64
 // CHECK-SAME: () -> tensor<4xi32>
-func.func @constant_dense_i64() -> tensor<4xi64> {
+util.func public @constant_dense_i64() -> tensor<4xi64> {
   // CHECK-NEXT: constant dense<[0, 1, 2, 3]> : tensor<4xi32>
   %c123 = arith.constant dense<[0, 1, 2, 3]> : tensor<4xi64>
-  return %c123 : tensor<4xi64>
+  util.return %c123 : tensor<4xi64>
 }
 
 // -----
 
-// CHECK-LABEL: func.func @args_i64
+// CHECK-LABEL: util.func public @args_i64
 // CHECK-SAME: (%arg0: i32) -> i32
-func.func @args_i64(%arg0: i64) -> i64 {
+util.func public @args_i64(%arg0: i64) -> i64 {
   // CHECK-NEXT: return %arg0 : i32
-  return %arg0 : i64
+  util.return %arg0 : i64
 }
 
 // -----
 
-// CHECK-LABEL: func.func @args_ui64
+// CHECK-LABEL: util.func public @args_ui64
 // CHECK-SAME: (%arg0: ui32) -> ui32
-func.func @args_ui64(%arg0: ui64) -> ui64 {
+util.func public @args_ui64(%arg0: ui64) -> ui64 {
   // CHECK-NEXT: return %arg0 : ui32
-  return %arg0 : ui64
+  util.return %arg0 : ui64
 }
 
 // -----
 
-// CHECK-LABEL: func.func @args_tensor_i64
+// CHECK-LABEL: util.func public @args_tensor_i64
 // CHECK-SAME: (%arg0: tensor<4x4xi32>) -> tensor<4x4xi32>
-func.func @args_tensor_i64(%arg0: tensor<4x4xi64>) -> tensor<4x4xi64> {
+util.func public @args_tensor_i64(%arg0: tensor<4x4xi64>) -> tensor<4x4xi64> {
   // CHECK-NEXT: return %arg0 : tensor<4x4xi32>
-  return %arg0 : tensor<4x4xi64>
+  util.return %arg0 : tensor<4x4xi64>
 }
 
 // -----
@@ -60,29 +60,29 @@
 // Return types should be converted for all operations, even those that the
 // core compiler is not directly aware of.
 
-// CHECK-LABEL: func.func @custom_constant_i64
+// CHECK-LABEL: util.func public @custom_constant_i64
 // CHECK-SAME: () -> tensor<1xi32>
-func.func @custom_constant_i64() -> tensor<1xi64> {
+util.func public @custom_constant_i64() -> tensor<1xi64> {
   // CHECK-NEXT: "custom.constant"() : () -> tensor<1xi32>
   %c0 = "custom.constant"() : () -> tensor<1xi64>
-  return %c0 : tensor<1xi64>
+  util.return %c0 : tensor<1xi64>
 }
 
 // -----
 
-// CHECK-LABEL: func.func @custom_constant_ui64
+// CHECK-LABEL: util.func public @custom_constant_ui64
 // CHECK-SAME: () -> tensor<1xui32>
-func.func @custom_constant_ui64() -> tensor<1xui64> {
+util.func public @custom_constant_ui64() -> tensor<1xui64> {
   // CHECK-NEXT: "custom.constant"() : () -> tensor<1xui32>
   %c0 = "custom.constant"() : () -> tensor<1xui64>
-  return %c0 : tensor<1xui64>
+  util.return %c0 : tensor<1xui64>
 }
 
 // -----
 
-// CHECK-LABEL: func.func @arith_cmpi_i64
+// CHECK-LABEL: util.func public @arith_cmpi_i64
 // CHECK-SAME: (%arg0: tensor<i32>, %arg1: tensor<i32>) -> (i1, tensor<i32>)
-func.func @arith_cmpi_i64(%arg0 : tensor<i64>, %arg1 : tensor<i64>) -> (i1, tensor<i64>) {
+util.func public @arith_cmpi_i64(%arg0 : tensor<i64>, %arg1 : tensor<i64>) -> (i1, tensor<i64>) {
   // CHECK-NEXT: %0 = arith.cmpi slt, %arg0, %arg1 : tensor<i32>
   // CHECK-NEXT: %[[EXT:.*]] = tensor.extract %0[] : tensor<i1>
   // CHECK-NEXT: cf.cond_br %[[EXT]], ^bb1(%[[EXT]], %arg0 : i1, tensor<i32>), ^bb2(%[[EXT]], %arg1 : i1, tensor<i32>)
@@ -94,28 +94,28 @@
   %1 = tensor.extract %0[] : tensor<i1>
   cf.cond_br %1, ^bb1(%1, %arg0 : i1, tensor<i64>), ^bb2(%1, %arg1 : i1, tensor<i64>)
 ^bb1(%2 : i1, %3 : tensor<i64>):
-  return %2, %3 : i1, tensor<i64>
+  util.return %2, %3 : i1, tensor<i64>
 ^bb2(%4 : i1, %5 : tensor<i64>):
-  return %4, %5 : i1, tensor<i64>
+  util.return %4, %5 : i1, tensor<i64>
 }
 
 // -----
 
-// CHECK-LABEL: func.func @linalg_matmul_i64
-func.func @linalg_matmul_i64(%arg0: tensor<2x3xi64>, %arg1: tensor<3x4xi64>, %arg2: tensor<2x4xi64>)  -> tensor<2x4xi64> {
+// CHECK-LABEL: util.func public @linalg_matmul_i64
+util.func public @linalg_matmul_i64(%arg0: tensor<2x3xi64>, %arg1: tensor<3x4xi64>, %arg2: tensor<2x4xi64>)  -> tensor<2x4xi64> {
   // CHECK: %[[T:.+]] = linalg.matmul ins(%arg0, %arg1 : tensor<2x3xi32>, tensor<3x4xi32>)
   // CHECK-SAME:                     outs(%arg2 : tensor<2x4xi32>) -> tensor<2x4xi32>
   %0 = linalg.matmul ins(%arg0, %arg1 : tensor<2x3xi64>, tensor<3x4xi64>)
                     outs(%arg2 : tensor<2x4xi64>) -> tensor<2x4xi64>
   // CHECK-NEXT: return %[[T:.+]] : tensor<2x4xi32>
-  return %0 : tensor<2x4xi64>
+  util.return %0 : tensor<2x4xi64>
 }
 
 // -----
 
-// CHECK-LABEL: func.func @linalg_generic_i64
+// CHECK-LABEL: util.func public @linalg_generic_i64
 // CHECK-SAME: (%[[ARG:.+]]: tensor<2xi32>) -> tensor<2xi32>
-func.func @linalg_generic_i64(%arg: tensor<2xi64>)  -> tensor<2xi64> {
+util.func public @linalg_generic_i64(%arg: tensor<2xi64>)  -> tensor<2xi64> {
   // CHECK: %[[INIT:.+]] = tensor.empty() : tensor<2xi32>
   %init = tensor.empty() : tensor<2xi64>
   // CHECK: %[[T:.+]] = linalg.generic {{.+}} ins(%[[ARG]] : tensor<2xi32>) outs(%[[INIT]] : tensor<2xi32>)
@@ -126,18 +126,18 @@
     linalg.yield %arg1 : i64
   } -> tensor<2xi64>
   // CHECK: %[[T]] : tensor<2xi32>
-  return %generic : tensor<2xi64>
+  util.return %generic : tensor<2xi64>
 }
 
 // -----
 
-// CHECK-LABEL: func.func @linalg_non_structured_op
+// CHECK-LABEL: util.func public @linalg_non_structured_op
 // CHECK-SAME:    (%arg0: tensor<9xi32>) -> tensor<1x9xi32>
-func.func @linalg_non_structured_op(%arg0: tensor<9xi64>) -> tensor<1x9xi64> {
+util.func public @linalg_non_structured_op(%arg0: tensor<9xi64>) -> tensor<1x9xi64> {
   // CHECK:       %[[RES:.+]] = tensor.expand_shape %arg0 {{\[}}[0, 1]] : tensor<9xi32> into tensor<1x9xi32>
-  // CHECK:       return %[[RES:.+]] : tensor<1x9xi32>
+  // CHECK:       util.return %[[RES:.+]] : tensor<1x9xi32>
   %0 = tensor.expand_shape %arg0 [[0, 1]] : tensor<9xi64> into tensor<1x9xi64>
-  return %0 : tensor<1x9xi64>
+  util.return %0 : tensor<1x9xi64>
 }
 
 // -----
@@ -146,11 +146,11 @@
 // CHECK: util.global.load @[[VAR]]
 // CHECK: util.global.store %{{.+}}, @[[VAR]]
 util.global mutable @readwritevar = dense<0> : tensor<i64>
-func.func @foo(%arg0 : tensor<i64>) {
+util.func public @foo(%arg0 : tensor<i64>) {
   %0 = util.global.load @readwritevar : tensor<i64>
   %1 = arith.addi %0, %arg0 : tensor<i64>
   util.global.store %1, @readwritevar : tensor<i64>
-  return
+  util.return
 }
 
 // -----
@@ -158,38 +158,38 @@
 // CHECK: util.global private @{{.+}} : tensor<4xi32>
 util.global private @v_initializer : tensor<4xi64>
 util.initializer {
-  // CHECK: %[[VALUE:.+]] = func.call @initializer() : () -> tensor<4xi32>
-  %0 = func.call @initializer() : () -> tensor<4xi64>
+  // CHECK: %[[VALUE:.+]] = util.call @initializer() : () -> tensor<4xi32>
+  %0 = util.call @initializer() : () -> tensor<4xi64>
   // CHECK: util.global.store %[[VALUE]], @v_initializer : tensor<4xi32>
   util.global.store %0, @v_initializer : tensor<4xi64>
   util.return
 }
-// CHECK: func.func private @initializer() -> tensor<4xi32>
-func.func private @initializer() -> tensor<4xi64>
+// CHECK: util.func private @initializer() -> tensor<4xi32>
+util.func private @initializer() -> tensor<4xi64>
 
 // -----
 
 //       CHECK: util.global {{.*}} : tensor<4xi32>
-// CHECK-LABEL: func.func @simple_i64() -> tensor<4xi32>
+// CHECK-LABEL: util.func public @simple_i64() -> tensor<4xi32>
 //  CHECK-NEXT: %{{.*}} = util.global.address @__global : !util.ptr<tensor<4xi32>>
 //  CHECK-NEXT: %{{.*}} = util.global.load.indirect %{{.*}} : !util.ptr<tensor<4xi32>> -> tensor<4xi32>
-//  CHECK-NEXT: return %{{.*}} : tensor<4xi32>
+//  CHECK-NEXT: util.return %{{.*}} : tensor<4xi32>
 util.global private @"__global" = dense<[1, 2, 3, 4]> : tensor<4xi64>
-func.func @simple_i64() -> (tensor<4xi64>) {
+util.func public @simple_i64() -> (tensor<4xi64>) {
   %0 = util.global.address @"__global" : !util.ptr<tensor<4xi64>>
   %1 = util.global.load.indirect %0 : !util.ptr<tensor<4xi64>> -> tensor<4xi64>
-  return %1 : tensor<4xi64>
+  util.return %1 : tensor<4xi64>
 }
 
 // -----
 
 // CHECK: util.global
 // CHECK-NOT: i64
-// CHECK-LABEL: func.func @nested_region_i64()
+// CHECK-LABEL: util.func public @nested_region_i64()
 // CHECK-NOT: i64
-// CHECK: return %{{.*}} : tensor<?xi32>
+// CHECK: util.return %{{.*}} : tensor<?xi32>
 util.global private @"__global" = dense<[1, 2, 3, 4]> : tensor<4xi64>
-func.func @nested_region_i64() -> (tensor<?xi64>) {
+util.func public @nested_region_i64() -> (tensor<?xi64>) {
   %0 = util.global.address @"__global" : !util.ptr<tensor<4xi64>>
   %1 = util.global.load.indirect %0 : !util.ptr<tensor<4xi64>> -> tensor<4xi64>
   %c4 = arith.constant 4 : index
@@ -198,35 +198,35 @@
     %element = tensor.extract %1[%arg0] : tensor<4xi64>
     tensor.yield %element : i64
   } : tensor<?xi64>
-  return %2 : tensor<?xi64>
+  util.return %2 : tensor<?xi64>
 }
 
 // -----
 
 // Check handling of width-sensitive arith casts.
 
-// CHECK-LABEL:   func.func @arith.trunci(
-// CHECK-SAME:            %[[ARG0:.*]]: i32) -> i32 {
-// CHECK:           return %[[ARG0]] : i32
-func.func @arith.trunci(%arg0: i64) -> i32 {
+// CHECK-LABEL: util.func public @arith.trunci(
+// CHECK-SAME:      %[[ARG0:.*]]: i32) -> i32 {
+// CHECK:         util.return %[[ARG0]] : i32
+util.func public @arith.trunci(%arg0: i64) -> i32 {
   %0 = arith.trunci %arg0 : i64 to i32
-  return %0 : i32
+  util.return %0 : i32
 }
 
-// CHECK-LABEL:   func.func @arith.extui(
-// CHECK-SAME:            %[[ARG0:.*]]: i32) -> i32 {
-// CHECK:           return %[[ARG0]] : i32
-func.func @arith.extui(%arg0: i32) -> i64 {
+// CHECK-LABEL: util.func public @arith.extui(
+// CHECK-SAME:      %[[ARG0:.*]]: i32) -> i32 {
+// CHECK:         util.return %[[ARG0]] : i32
+util.func public @arith.extui(%arg0: i32) -> i64 {
   %0 = arith.extui %arg0 : i32 to i64
-  return %0 : i64
+  util.return %0 : i64
 }
 
-// CHECK-LABEL:   func.func @arith.extsi(
-// CHECK-SAME:            %[[ARG0:.*]]: i32) -> i32 {
-// CHECK:           return %[[ARG0]] : i32
-func.func @arith.extsi(%arg0: i32) -> i64 {
+// CHECK-LABEL: util.func public @arith.extsi(
+// CHECK-SAME:      %[[ARG0:.*]]: i32) -> i32 {
+// CHECK:         util.return %[[ARG0]] : i32
+util.func public @arith.extsi(%arg0: i32) -> i64 {
   %0 = arith.extsi %arg0 : i32 to i64
-  return %0 : i64
+  util.return %0 : i64
 }
 
 // -----
@@ -236,13 +236,13 @@
 // CHECK: ml_program.global
 // CHECK-SAME: i32
 "ml_program.global"() {sym_name = "_v", sym_visibility = "private", type = tensor<2x2xi64>, value = dense<1> : tensor<2x2xi64>} : () -> ()
-func.func @run() -> tensor<2x2xi64> {
+util.func public @run() -> tensor<2x2xi64> {
   %0 = "ml_program.global_load"() {global = @_v} : () -> tensor<2x2xi64>
-  %1 = call @f(%0) : (tensor<2x2xi64>) -> tensor<2x2xi64>
-  return %1 : tensor<2x2xi64>
+  %1 = util.call @f(%0) : (tensor<2x2xi64>) -> tensor<2x2xi64>
+  util.return %1 : tensor<2x2xi64>
 }
 
-func.func private @f(%arg0: tensor<2x2xi64>) -> tensor<2x2xi64> {
-  return %arg0 : tensor<2x2xi64>
+util.func private @f(%arg0: tensor<2x2xi64>) -> tensor<2x2xi64> {
+  util.return %arg0 : tensor<2x2xi64>
 }
 

diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/drop_compiler_hints.mlir b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/drop_compiler_hints.mlir
index 319bd85..717d2bf 100644
--- a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/drop_compiler_hints.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/drop_compiler_hints.mlir

@@ -4,18 +4,18 @@
 // If you move or delete it, please update the documentation accordingly.
 
 // CHECK-LABEL: @constant
-func.func @constant() -> i32 {
+util.func @constant() -> i32 {
   // CHECK-NEXT: %[[C1:.+]] = arith.constant 1
   %c1 = arith.constant 1 : i32
   %0 = util.optimization_barrier %c1 : i32
-  // CHECK-NEXT: return %[[C1]]
-  return %0 : i32
+  // CHECK-NEXT: util.return %[[C1]]
+  util.return %0 : i32
 }
 
 // -----
 
 // CHECK-LABEL: @multiple
-func.func @multiple() -> (i32, i32) {
+util.func @multiple() -> (i32, i32) {
   // CHECK-NEXT: %[[C1:.+]] = arith.constant 1
   %c1 = arith.constant 1 : i32
   %0 = util.optimization_barrier %c1 : i32
@@ -24,34 +24,34 @@
   %c2 = arith.constant 2 : i32
   %2 = util.optimization_barrier %1 : i32
   %3 = util.optimization_barrier %c2 : i32
-  // CHECK-NEXT: return %[[C1]], %[[C2]]
-  return %2, %3 : i32, i32
+  // CHECK-NEXT: util.return %[[C1]], %[[C2]]
+  util.return %2, %3 : i32, i32
 }
 
 // -----
 
 // CHECK-LABEL: @multiple_operands
-func.func @multiple_operands() -> (i32, i32) {
+util.func @multiple_operands() -> (i32, i32) {
   // CHECK-NEXT: %[[C1:.+]] = arith.constant 1
   %c1 = arith.constant 1 : i32
   // CHECK-NEXT: %[[C2:.+]] = arith.constant 2
   %c2 = arith.constant 2 : i32
   %0, %1 = util.optimization_barrier %c1, %c2 : i32, i32
-  // CHECK-NEXT: return %[[C1]], %[[C2]]
-  return %0, %1 : i32, i32
+  // CHECK-NEXT: util.return %[[C1]], %[[C2]]
+  util.return %0, %1 : i32, i32
 }
 
 // -----
 
 // CHECK-LABEL: @no_fold_add
-func.func @no_fold_add() -> (i32) {
+util.func @no_fold_add() -> (i32) {
   // CHECK-NEXT: %[[C1:.+]] = arith.constant 1 : i32
   %c1 = arith.constant 1 : i32
   %0 = util.optimization_barrier %c1 : i32
   // CHECK-NEXT: %[[R:.+]] = arith.addi %[[C1]], %[[C1]]
   %1 = arith.addi %0, %0 : i32
-  // CHECK-NEXT: return %[[R]]
-  return %1 : i32
+  // CHECK-NEXT: util.return %[[R]]
+  util.return %1 : i32
 }
 
 // -----
@@ -63,12 +63,12 @@
     // CHECK-LABEL: @inner
     module @inner {
       // CHECK-LABEL: @constant
-      func.func @constant() -> i32 {
+      util.func @constant() -> i32 {
         // CHECK-NEXT: %[[C1:.+]] = arith.constant 1
         %c1 = arith.constant 1 : i32
         %0 = util.optimization_barrier %c1 : i32
-        // CHECK-NEXT: return %[[C1]]
-        return %0 : i32
+        // CHECK-NEXT: util.return %[[C1]]
+        util.return %0 : i32
       }
     }
   }

diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/fold_globals.mlir b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/fold_globals.mlir
index 6eaeb1b..3e38eaa 100644
--- a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/fold_globals.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/fold_globals.mlir

@@ -2,34 +2,34 @@
 
 // CHECK: util.global public mutable @uniformConstants = 5 : index
 util.global public mutable @uniformConstants : index
-func.func @foo() {
+util.func @foo() {
   %c5 = arith.constant 5 : index
   // CHECK-NOT: util.global.store %c5, @uniformConstants : index
   util.global.store %c5, @uniformConstants : index
-  return
+  util.return
 }
-func.func @bar() {
+util.func @bar() {
   %c5 = arith.constant 5 : index
   // CHECK-NOT: util.global.store %c5, @uniformConstants : index
   util.global.store %c5, @uniformConstants : index
-  return
+  util.return
 }
 
 // -----
 
 // CHECK: util.global public mutable @nonuniformConstants : index
 util.global public mutable @nonuniformConstants : index
-func.func @foo() {
+util.func @foo() {
   %c5 = arith.constant 5 : index
   // CHECK: util.global.store %c5, @nonuniformConstants : index
   util.global.store %c5, @nonuniformConstants : index
-  return
+  util.return
 }
-func.func @bar() {
+util.func @bar() {
   %c6 = arith.constant 6 : index
   // CHECK: util.global.store %c6, @nonuniformConstants : index
   util.global.store %c6, @nonuniformConstants : index
-  return
+  util.return
 }
 
 // -----
@@ -38,13 +38,13 @@
 util.global private mutable @chained0 : index
 // CHECK-NOT: util.global private mutable @chained1 : index
 util.global private mutable @chained1 : index
-func.func @foo() -> index {
+util.func @foo() -> index {
   // CHECK: %[[VALUE:.+]] = util.global.load immutable @chained0 : index
   %0 = util.global.load @chained0 : index
   // CHECK-NOT: util.global.store
   util.global.store %0, @chained1 : index
   // CHECK-NEXT: return %[[VALUE]]
-  return %0 : index
+  util.return %0 : index
 }
 
 // -----
@@ -53,17 +53,17 @@
 util.global public mutable @unchained0 : index
 // CHECK: util.global public mutable @unchained1 : index
 util.global public mutable @unchained1 : index
-func.func @foo() {
+util.func @foo() {
   // CHECK: %[[VALUE:.+]] = util.global.load @unchained0 : index
   %0 = util.global.load @unchained0 : index
   // CHECK: util.global.store %[[VALUE]], @unchained1 : index
   util.global.store %0, @unchained1 : index
-  return
+  util.return
 }
-func.func @bar(%arg0: index) {
+util.func @bar(%arg0: index) {
   // CHECK: util.global.store %arg0, @unchained1 : index
   util.global.store %arg0, @unchained1 : index
-  return
+  util.return
 }
 
 // -----
@@ -83,7 +83,7 @@
   util.global.store %c6, @immutable1 : index
   util.return
 }
-func.func @foo(%arg0: index) -> (index, index, index) {
+util.func @foo(%arg0: index) -> (index, index, index) {
   // CHECK-DAG: %[[C5:.+]] = arith.constant 5
   %0 = util.global.load @immutable0 : index
   // CHECK-DAG: %[[C6:.+]] = arith.constant 6
@@ -93,7 +93,7 @@
   // CHECK: util.global.store %arg0, @mutable
   util.global.store %arg0, @mutable : index
   // CHECK: return %[[C5]], %[[C6]], %[[MUTABLE]]
-  return %0, %1, %2 : index, index, index
+  util.return %0, %1, %2 : index, index, index
 }
 
 // -----
@@ -102,7 +102,7 @@
 util.global private mutable @used0 = 5 : index
 // CHECK: util.global private mutable @used1 : index
 util.global private mutable @used1 : index
-func.func @foo(%arg0: index, %arg1: index) -> (index, index) {
+util.func @foo(%arg0: index, %arg1: index) -> (index, index) {
   // CHECK: %[[VALUE0:.+]] = util.global.load @used0 : index
   %0 = util.global.load @used0 : index
   // CHECK: %[[VALUE1:.+]] = util.global.load @used1 : index
@@ -112,7 +112,7 @@
   // CHECK: util.global.store %arg1, @used1 : index
   util.global.store %arg1, @used1 : index
   // CHECK: return %[[VALUE0]], %[[VALUE1]]
-  return %0, %1 : index, index
+  util.return %0, %1 : index, index
 }
 
 // -----
@@ -134,13 +134,13 @@
 util.global private @dupeCst0 {inlining_policy = #util.inline.never} = 5 : index
 // CHECK-NOT: util.global private @dupeCst1
 util.global private @dupeCst1 {inlining_policy = #util.inline.never} = 5 : index
-func.func @foo() -> (index, index) {
+util.func @foo() -> (index, index) {
   // CHECK-DAG: %[[VALUE0:.+]] = util.global.load immutable @dupeCst0
   %0 = util.global.load @dupeCst0 : index
   // CHECK-DAG: %[[VALUE1:.+]] = util.global.load immutable @dupeCst0
   %1 = util.global.load @dupeCst1 : index
   // CHECK: return %[[VALUE0]], %[[VALUE1]]
-  return %0, %1 : index, index
+  util.return %0, %1 : index, index
 }
 
 // -----
@@ -155,11 +155,11 @@
   util.global.store %c7, @nondupeCst1 : index
   util.return
 }
-func.func @foo() -> (index, index) {
+util.func @foo() -> (index, index) {
   // CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index
   %0 = util.global.load @nondupeCst0 : index
   // CHECK-DAG: %[[C7:.+]] = arith.constant 7 : index
   %1 = util.global.load @nondupeCst1 : index
   // CHECK: return %[[C6]], %[[C7]]
-  return %0, %1 : index, index
+  util.return %0, %1 : index, index
 }

diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/fuse_globals.mlir b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/fuse_globals.mlir
index e77f49d..e7d0b45 100644
--- a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/fuse_globals.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/fuse_globals.mlir

@@ -3,7 +3,7 @@
 // CHECK: util.global private mutable @fusable0 : index
 util.global private mutable @fusable0 : index
 util.global private mutable @fusable1 : index
-func.func @foo(%arg0: index) -> (index, index) {
+util.func @foo(%arg0: index) -> (index, index) {
   // CHECK: util.global.store %arg0, @fusable0
   util.global.store %arg0, @fusable0 : index
   // CHECK-NOT: util.global.store %arg0, @fusable1
@@ -12,8 +12,8 @@
   %0 = util.global.load @fusable0 : index
   // CHECK: %[[VALUE1:.+]] = util.global.load @fusable0 : index
   %1 = util.global.load @fusable1 : index
-  // CHECK: return %[[VALUE0]], %[[VALUE1]]
-  return %0, %1 : index, index
+  // CHECK: util.return %[[VALUE0]], %[[VALUE1]]
+  util.return %0, %1 : index, index
 }
 
 // -----
@@ -24,7 +24,7 @@
 util.global private mutable @unfusable0 : index
 // CHECK: util.global private mutable @unfusable1 : index
 util.global private mutable @unfusable1 : index
-func.func @nonuniform_a(%arg0: index) -> (index, index) {
+util.func @nonuniform_a(%arg0: index) -> (index, index) {
   // CHECK: util.global.store %arg0, @unfusable0 : index
   util.global.store %arg0, @unfusable0 : index
   // CHECK: util.global.store %arg0, @unfusable1 : index
@@ -33,12 +33,12 @@
   %0 = util.global.load @unfusable0 : index
   // CHECK: %[[VALUE1:.+]] = util.global.load @unfusable1 : index
   %1 = util.global.load @unfusable1 : index
-  // CHECK: return %[[VALUE0]], %[[VALUE1]]
-  return %0, %1 : index, index
+  // CHECK: util.return %[[VALUE0]], %[[VALUE1]]
+  util.return %0, %1 : index, index
 }
-func.func @nonuniform_b(%arg0: index) {
+util.func @nonuniform_b(%arg0: index) {
   util.global.store %arg0, @unfusable0 : index
-  return
+  util.return
 }
 util.initializer {
   %0 = "some.op"() : () -> index
@@ -54,7 +54,7 @@
 util.global private mutable @unfusableInit0 = 5 : index
 // CHECK: util.global private mutable @unfusableInit1 = 6 : index
 util.global private mutable @unfusableInit1 = 6 : index
-func.func @initializer_mix(%arg0: index) -> (index, index) {
+util.func @initializer_mix(%arg0: index) -> (index, index) {
   // CHECK: util.global.store %arg0, @unfusableInit0
   util.global.store %arg0, @unfusableInit0 : index
   // CHECK: util.global.store %arg0, @unfusableInit1
@@ -63,8 +63,8 @@
   %0 = util.global.load @unfusableInit0 : index
   // CHECK: %[[VALUE1:.+]] = util.global.load @unfusableInit1 : index
   %1 = util.global.load @unfusableInit1 : index
-  // CHECK: return %[[VALUE0]], %[[VALUE1]]
-  return %0, %1 : index, index
+  // CHECK: util.return %[[VALUE0]], %[[VALUE1]]
+  util.return %0, %1 : index, index
 }
 
 // -----
@@ -73,14 +73,14 @@
 util.global private mutable @unfusableDivergent0 : index
 // CHECK: util.global private mutable @unfusableDivergent1
 util.global private mutable @unfusableDivergent1 : index
-func.func @fn_a(%arg0: index) {
+util.func @fn_a(%arg0: index) {
   util.global.store %arg0, @unfusableDivergent0 : index
   util.global.store %arg0, @unfusableDivergent1 : index
-  return
+  util.return
 }
-func.func @fn_b(%arg0: index) {
+util.func @fn_b(%arg0: index) {
   util.global.store %arg0, @unfusableDivergent0 : index
-  return
+  util.return
 }
 
 // -----
@@ -103,9 +103,9 @@
   util.global.store %v, @unfusableSubset2 : index
   util.return
 }
-// CHECK: func.func @mutate_unfusable(%[[ARG0:.+]]: index)
-func.func @mutate_unfusable(%arg0: index) {
+// CHECK: util.func public @mutate_unfusable(%[[ARG0:.+]]: index)
+util.func public @mutate_unfusable(%arg0: index) {
   // CHECK: util.global.store %[[ARG0]], @unfusableSubset2
   util.global.store %arg0, @unfusableSubset2 : index
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/hoist_into_globals.mlir b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/hoist_into_globals.mlir
index 2eacd77..d7af756 100644
--- a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/hoist_into_globals.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/hoist_into_globals.mlir

@@ -3,16 +3,16 @@
 // CHECK-LABEL: @hoist_simple_const_expr
 module @hoist_simple_const_expr {
   // CHECK: util.global private @[[HOISTED_SYM:.*]] : i32
-  // CHECK: func.func @main
-  func.func @main() -> (i32) {
+  // CHECK: util.func public @main
+  util.func public @main() -> (i32) {
     %0 = arith.constant 0 : i32
     %1 = arith.constant 1 : i32
     // CHECK-NOT: arith.constant
     // CHECK-NOT: iree_unregistered.const_expr
     // CHECK: %[[VAL:.*]] = util.global.load @[[HOISTED_SYM]] : i32
-    // CHECK: return %[[VAL]]
+    // CHECK: util.return %[[VAL]]
     %2 = "iree_unregistered.const_expr"(%0, %1) : (i32, i32) -> i32
-    return %2 : i32
+    util.return %2 : i32
   }
   // CHECK: util.initializer attributes {iree.compiler.consteval} {
   // CHECK:   %[[C0:.*]] = arith.constant 0 : i32
@@ -28,16 +28,16 @@
 // checks.
 // CHECK-LABEL: @do_not_hoist_variable_op
 // CHECK-NOT: util.global
-// CHECK: func.func @main
+// CHECK: util.func public @main
 // CHECK: %[[VAL:.*]] = "iree_unregistered.var_expr"
-// CHECK: return %[[VAL]]
+// CHECK: util.return %[[VAL]]
 // CHECK-NOT: util.initializer
 module @do_not_hoist_variable_op {
-  func.func @main() -> (i32) {
+  util.func public @main() -> (i32) {
     %0 = arith.constant 0 : i32
     %1 = arith.constant 1 : i32
     %2 = "iree_unregistered.var_expr"(%0, %1) : (i32, i32) -> i32
-    return %2 : i32
+    util.return %2 : i32
   }
 }
 
@@ -46,10 +46,10 @@
 // CHECK-NOT: util.global
 // CHECK-NOT: util.initializer
 module @do_not_hoist_variable_operands {
-  func.func @main(%arg0 : i32) -> (i32) {
+  util.func public @main(%arg0 : i32) -> (i32) {
     %0 = arith.constant 0 : i32
     %2 = "iree_unregistered.const_expr"(%0, %arg0) : (i32, i32) -> i32
-    return %2 : i32
+    util.return %2 : i32
   }
 }
 
@@ -58,10 +58,10 @@
 // CHECK-NOT: util.global
 // CHECK-NOT: util.initializer
 module @do_not_hoist_sub_byte_aligned_scalar_leaf {
-  func.func @main() -> (i32) {
+  util.func public @main() -> (i32) {
     %0 = arith.constant 1 : i1
     %2 = "iree_unregistered.var_expr"(%0) : (i1) -> i32
-    return %2 : i32
+    util.return %2 : i32
   }
 }
 
@@ -70,10 +70,10 @@
 // CHECK-NOT: util.global
 // CHECK-NOT: util.initializer
 module @do_not_hoist_sub_byte_aligned_tensor_leaf {
-  func.func @main() -> (i32) {
+  util.func public @main() -> (i32) {
     %0 = arith.constant dense<true> : tensor<i1>
     %2 = "iree_unregistered.var_expr"(%0) : (tensor<i1>) -> i32
-    return %2 : i32
+    util.return %2 : i32
   }
 }
 
@@ -83,10 +83,10 @@
 // Can hoist a const-expr tree that transitively includes sub-byte aligned
 // values.
 module @hoist_sub_byte_aligned_scalar_transitive {
-  func.func @main() -> (i32) {
+  util.func public @main() -> (i32) {
     %0 = arith.constant 1 : i1
     %2 = "iree_unregistered.const_expr"(%0) : (i1) -> i32
-    return %2 : i32
+    util.return %2 : i32
   }
 }
 
@@ -96,10 +96,10 @@
 // We presently expand i1 -> i8 for legacy reasons. As such, we support
 // it, even though we don't generally support sub-byte constexprs.
 module @hoist_i1_tensor_transitive {
-  func.func @main() -> (i32) {
+  util.func public @main() -> (i32) {
     %0 = arith.constant dense<true> : tensor<i1>
     %2 = "iree_unregistered.const_expr"(%0) : (tensor<i1>) -> i32
-    return %2 : i32
+    util.return %2 : i32
   }
 }
 
@@ -112,19 +112,19 @@
   // CHECK: util.global private @latent_global : i32
   util.global private @latent_global : i32
 
-  // CHECK: func.func @main
-  func.func @main() -> (i32, i32, i32) {
+  // CHECK: util.func public @main
+  util.func public @main() -> (i32, i32, i32) {
     // CHECK-DAG: %[[LOAD_HOISTED_0:.*]] = util.global.load @[[HOISTED_0]] : i32
     // CHECK-DAG: %[[LOAD_HOISTED_1:.*]] = util.global.load @[[HOISTED_1]] : i32
     // CHECK-DAG: %[[RESULT:.*]] = "iree_unregistered.var_expr"(%[[LOAD_HOISTED_1]])
-    // CHECK: return %[[LOAD_HOISTED_0]], %[[LOAD_HOISTED_1]], %[[RESULT]]
+    // CHECK: util.return %[[LOAD_HOISTED_0]], %[[LOAD_HOISTED_1]], %[[RESULT]]
     %0 = arith.constant 0 : i32
     %1 = arith.constant 1 : i32
     %2 = "iree_unregistered.const_expr"(%0, %1) : (i32, i32) -> i32
     %3 = util.global.load @latent_global : i32
     %4 = "iree_unregistered.const_expr"(%2, %3) : (i32, i32) -> i32
     %5 = "iree_unregistered.var_expr"(%4) : (i32) -> i32
-    return %2, %4, %5 : i32, i32, i32
+    util.return %2, %4, %5 : i32, i32, i32
   }
   // CHECK: util.initializer attributes {iree.compiler.consteval} {
   // CHECK:   %[[C0:.*]] = arith.constant 0 : i32
@@ -147,16 +147,16 @@
 // CHECK-LABEL: @hoist_const_expr_with_ineligible_consumer
 module @hoist_const_expr_with_ineligible_consumer {
   // CHECK: util.global private @[[HOISTED_0:.*]] : i32
-  // CHECK: func.func @main
-  func.func @main() -> i32 {
+  // CHECK: util.func public @main
+  util.func public @main() -> i32 {
     // CHECK-DAG: %[[LOAD_HOISTED_0:.*]] = util.global.load @[[HOISTED_0]] : i32
     // CHECK-DAG: %[[RESULT:.*]] = "iree_unregistered.var_expr"(%[[LOAD_HOISTED_0]])
-    // CHECK: return %[[RESULT]]
+    // CHECK: util.return %[[RESULT]]
     %0 = arith.constant 0 : i32
     %1 = arith.constant 1 : i32
     %2 = "iree_unregistered.const_expr"(%0, %1) : (i32, i32) -> i32
     %3 = "iree_unregistered.var_expr"(%2) : (i32) -> i32
-    return %3 : i32
+    util.return %3 : i32
   }
   // CHECK: util.initializer attributes {iree.compiler.consteval} {
   // CHECK:   %[[C0:.*]] = arith.constant 0 : i32
@@ -174,17 +174,17 @@
 // CHECK-LABEL: @hoist_non_leaf_const_expr
 module @hoist_non_leaf_const_expr {
   // CHECK: util.global private @[[HOISTED:.*]] : i32
-  // CHECK: func.func @main
-  func.func @main() -> (i32) {
+  // CHECK: util.func public @main
+  util.func public @main() -> (i32) {
     // CHECK: %[[LOAD_HOISTED:.*]] = util.global.load @[[HOISTED]] : i32
     // CHECK: %[[RESULT:.*]] = "iree_unregistered.non_leaf_const_expr"(%hoisted)
-    // CHECK: return %[[RESULT]]
+    // CHECK: util.return %[[RESULT]]
     %0 = arith.constant 0 : i32
     %1 = arith.constant 1 : i32
     %2 = "iree_unregistered.non_leaf_const_expr"(%0, %1) : (i32, i32) -> i32
     %3 = "iree_unregistered.const_expr"(%2) : (i32) -> i32
     %4 = "iree_unregistered.non_leaf_const_expr"(%3) : (i32) -> i32
-    return %4 : i32
+    util.return %4 : i32
   }
   // CHECK: util.initializer attributes {iree.compiler.consteval} {
   // CHECK:   %[[C0:.*]] = arith.constant 0 : i32
@@ -200,20 +200,20 @@
 // CHECK-LABEL: @hoist_implicit_capture
 module @hoist_implicit_capture {
   // CHECK: util.global private @[[HOISTED_SYM:.*]] : i32
-  // CHECK: func.func @main
-  func.func @main() -> (i32) {
+  // CHECK: util.func public @main
+  util.func public @main() -> (i32) {
     %0 = arith.constant 0 : i32
     %1 = arith.constant 1 : i32
     // CHECK-NOT: arith.constant
     // CHECK-NOT: iree_unregistered.const_expr
     // CHECK: %[[VAL:.*]] = util.global.load @[[HOISTED_SYM]] : i32
-    // CHECK: return %[[VAL]]
+    // CHECK: util.return %[[VAL]]
     %2 = "iree_unregistered.const_expr"(%0) ({
     ^bb0(%inner0 : i32):
       %3 = arith.addi %inner0, %1 : i32
       "iree_unregistered.yield"(%3) : (i32) -> i32
     }) : (i32) -> i32
-    return %2 : i32
+    util.return %2 : i32
   }
   // Key checks: arith.constant 1 gets pulled in to the initializer
   // and the reference is updated correctly in the custom op region.
@@ -233,24 +233,24 @@
 // CHECK-NOT: util.global
 // CHECK-NOT: util.initializer
 module @do_not_hoist_non_value_type_results {
-  func.func @main() -> (!iree_unregistered.unknown_type) {
+  util.func public @main() -> (!iree_unregistered.unknown_type) {
     %0 = arith.constant 0 : i32
     %1 = arith.constant 1 : i32
     %2 = "iree_unregistered.const_expr"(%0, %1) : (i32, i32) -> !iree_unregistered.unknown_type
-    return %2 : !iree_unregistered.unknown_type
+    util.return %2 : !iree_unregistered.unknown_type
   }
 }
 
 // -----
 
 module @do_not_hoist_uses_within_dispatches {
-  func.func @main() -> (tensor<i32>) {
+  util.func public @main() -> (tensor<i32>) {
     %cst = arith.constant dense<[2, 3]>: tensor<2xi32>
     %result = flow.dispatch.region -> (tensor<i32>) {
       %slice = tensor.extract_slice %cst[0] [1] [1] : tensor<2xi32> to tensor<i32>
       flow.return %slice : tensor<i32>
     }
-    return %result : tensor<i32>
+    util.return %result : tensor<i32>
   }
 }
 // CHECK-LABEL: @do_not_hoist_uses_within_dispatches
@@ -258,12 +258,12 @@
 //       CHECK:   %[[RESULT:.+]] = flow.dispatch.region
 //       CHECK:     %[[SLICE:.+]] = tensor.extract_slice %[[CST]]
 //       CHECK:     flow.return %[[SLICE]]
-//       CHECK:   return %[[RESULT]]
+//       CHECK:   util.return %[[RESULT]]
 
 // -----
 #map = affine_map<(d0, d1) -> (d0, d1)>
 module @do_not_hoist_uses_within_dispatches {
-  func.func @main() -> tensor<2x2xi32> {
+  util.func public @main() -> tensor<2x2xi32> {
     %0 = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32>
     %1 = arith.constant dense<[[6, 7], [8,9]]> : tensor<2x2xi32>
     %expanded = tensor.expand_shape %0[[0, 1]] : tensor<4xi32> into tensor<2x2xi32>
@@ -276,7 +276,7 @@
       } -> tensor<2x2xi32>
       flow.return %4 : tensor<2x2xi32>
     }
-    return %3 : tensor<2x2xi32>
+    util.return %3 : tensor<2x2xi32>
   }
 }
 // CHECK-LABEL: @do_not_hoist_uses_within_dispatches
@@ -286,7 +286,7 @@
 //       CHECK:     %[[ADD:.+]] = linalg.generic
 //  CHECK-SAME:     %[[EXPANDED]]
 //       CHECK:     flow.return %[[ADD]]
-//       CHECK:   return %[[RESULT]]
+//       CHECK:   util.return %[[RESULT]]
 
 // -----
 
@@ -299,12 +299,12 @@
 // CHECK: util.global
 // CHECK: util.initializer
 module @hoist_no_significant_size_increase_const_expr {
-  func.func @main() -> (tensor<128xi8>) {
+  util.func public @main() -> (tensor<128xi8>) {
     %0 = arith.constant dense<0> : tensor<32xi8>
     %1 = arith.constant dense<0> : tensor<32xi8>
     %2 = "iree_unregistered.const_expr"(%0, %1)
-    : (tensor<32xi8>, tensor<32xi8>) -> tensor<128xi8>
-    return %2 : tensor<128xi8>
+        : (tensor<32xi8>, tensor<32xi8>) -> tensor<128xi8>
+    util.return %2 : tensor<128xi8>
   }
 }
 
@@ -316,12 +316,12 @@
 // CHECK-NOT: util.global
 // CHECK-NOT: util.initializer
 module @do_not_hoist_significant_size_increase_const_expr {
-  func.func @main() -> (tensor<129xi8>) {
+  util.func public @main() -> (tensor<129xi8>) {
     %0 = arith.constant dense<0> : tensor<32xi8>
     %1 = arith.constant dense<0> : tensor<32xi8>
     %2 = "iree_unregistered.const_expr"(%0, %1)
-    : (tensor<32xi8>, tensor<32xi8>) -> tensor<129xi8>
-    return %2 : tensor<129xi8>
+        : (tensor<32xi8>, tensor<32xi8>) -> tensor<129xi8>
+    util.return %2 : tensor<129xi8>
   }
 }
 
@@ -335,11 +335,11 @@
 // CHECK-NOT: util.initializer
 module @nested_program_const_expr {
   module {
-    func.func @main() -> (i32) {
+    util.func public @main() -> (i32) {
       %0 = arith.constant 0 : i32
       %1 = arith.constant 1 : i32
       %2 = "iree_unregistered.const_expr"(%0, %1) : (i32, i32) -> i32
-      return %2 : i32
+      util.return %2 : i32
     }
   }
 }
@@ -353,9 +353,9 @@
 //      CHECK: util.initializer {
 // CHECK-NEXT:   util.global.load @parameter_constant
   util.global private @parameter_constant = #stream.parameter.named<"compile"::"constant_hoisted_0"> : i32
-  func.func @main() -> (i32) {
+  util.func public @main() -> (i32) {
     %load = util.global.load @parameter_constant : i32
     %1 = "iree_unregistered.const_expr"(%load) : (i32) -> i32
-    return %1 : i32
+    util.return %1 : i32
   }
 }

diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/import_resources.mlir b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/import_resources.mlir
index e8b3b50..06360c7 100644
--- a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/import_resources.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/import_resources.mlir

@@ -1,89 +1,89 @@
 // RUN: iree-opt --split-input-file --iree-util-import-resources %s | FileCheck %s
 
-// CHECK-LABEL: func.func @constant_splat_i64
-func.func @constant_splat_i64() -> tensor<4xi64> {
+// CHECK-LABEL: util.func public @constant_splat_i64
+util.func public @constant_splat_i64() -> tensor<4xi64> {
   // Splats should not convert.
   // CHECK-NEXT: constant dense<123>
   %c123 = arith.constant dense<123> : tensor<4xi64>
-  return %c123 : tensor<4xi64>
+  util.return %c123 : tensor<4xi64>
 }
 
 // -----
-// CHECK-LABEL: func.func @dense_i1
-func.func @dense_i1() -> tensor<4xi1> {
+// CHECK-LABEL: util.func public @dense_i1
+util.func public @dense_i1() -> tensor<4xi1> {
   // CHECK: dense_resource<dense_elements_i1>
   %c123 = arith.constant dense<[true, false, false, true]> : tensor<4xi1>
-  return %c123 : tensor<4xi1>
+  util.return %c123 : tensor<4xi1>
 }
 
 // CHECK: dense_elements_i1: "0x4000000001000001"
 
 // -----
-// CHECK-LABEL: func.func @dense_i8
-func.func @dense_i8() -> tensor<4xi8> {
+// CHECK-LABEL: util.func public @dense_i8
+util.func public @dense_i8() -> tensor<4xi8> {
   // CHECK: dense_resource<dense_elements_i8>
   %c123 = arith.constant dense<[1, 2, 3, 127]> : tensor<4xi8>
-  return %c123 : tensor<4xi8>
+  util.return %c123 : tensor<4xi8>
 }
 
 // CHECK: dense_elements_i8: "0x400000000102037F"
 
 // -----
-// CHECK-LABEL: func.func @dense_i16
-func.func @dense_i16() -> tensor<4xi16> {
+// CHECK-LABEL: util.func public @dense_i16
+util.func public @dense_i16() -> tensor<4xi16> {
   // CHECK: dense_resource<dense_elements_i16>
   %c123 = arith.constant dense<[1, 2, 3, 127]> : tensor<4xi16>
-  return %c123 : tensor<4xi16>
+  util.return %c123 : tensor<4xi16>
 }
 
 // CHECK: dense_elements_i16: "0x400000000100020003007F00"
 
 // -----
-// CHECK-LABEL: func.func @dense_i32
-func.func @dense_i32() -> tensor<4xi32> {
+// CHECK-LABEL: util.func public @dense_i32
+util.func public @dense_i32() -> tensor<4xi32> {
   // CHECK: dense_resource<dense_elements_i32>
   %c123 = arith.constant dense<[1, 2, 3, 127]> : tensor<4xi32>
-  return %c123 : tensor<4xi32>
+  util.return %c123 : tensor<4xi32>
 }
 
 // CHECK: dense_elements_i32: "0x400000000100000002000000030000007F000000"
 
 // -----
-// CHECK-LABEL: func.func @dense_i64
-func.func @dense_i64() -> tensor<4xi64> {
+// CHECK-LABEL: util.func public @dense_i64
+util.func public @dense_i64() -> tensor<4xi64> {
   // CHECK: dense_resource<dense_elements_i64>
   %c123 = arith.constant dense<[1, 2, 3, 127]> : tensor<4xi64>
-  return %c123 : tensor<4xi64>
+  util.return %c123 : tensor<4xi64>
 }
 
 // CHECK: dense_elements_i64: "0x400000000100000000000000020000000000000003000000000000007F00000000000000"
 
 // -----
-// CHECK-LABEL: func.func @dense_f16
-func.func @dense_f16() -> tensor<4xf16> {
+// CHECK-LABEL: util.func public @dense_f16
+util.func public @dense_f16() -> tensor<4xf16> {
   // CHECK: dense_resource<dense_elements_f16>
   %c123 = arith.constant dense<[1.1, 2.2, 3.3, 0.0]> : tensor<4xf16>
-  return %c123 : tensor<4xf16>
+  util.return %c123 : tensor<4xf16>
 }
 
 // CHECK: dense_elements_f16: "0x40000000663C66409A420000"
 
 // -----
-// CHECK-LABEL: func.func @dense_f32
-func.func @dense_f32() -> tensor<4xf32> {
+// CHECK-LABEL: util.func public @dense_f32
+util.func public @dense_f32() -> tensor<4xf32> {
   // CHECK: dense_resource<dense_elements_f32>
   %c123 = arith.constant dense<[1.1, 2.2, 3.3, 0.0]> : tensor<4xf32>
-  return %c123 : tensor<4xf32>
+  util.return %c123 : tensor<4xf32>
 }
 
 // CHECK: dense_elements_f32: "0x40000000CDCC8C3FCDCC0C403333534000000000"
 
 // -----
-// CHECK-LABEL: func.func @dense_f64
-func.func @dense_f64() -> tensor<4xf64> {
+// CHECK-LABEL: util.func public @dense_f64
+util.func public @dense_f64() -> tensor<4xf64> {
   // CHECK: dense_resource<dense_elements_f64>
   %c123 = arith.constant dense<[1.1, 2.2, 3.3, 0.0]> : tensor<4xf64>
-  return %c123 : tensor<4xf64>
+  util.return %c123 : tensor<4xf64>
 }
 
 // CHECK: dense_elements_f64: "0x400000009A9999999999F13F9A999999999901406666666666660A400000000000000000"

diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/ipo.mlir b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/ipo.mlir
index 631dc40..27ead2b 100644
--- a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/ipo.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/ipo.mlir

@@ -2,99 +2,99 @@
 
 // Tests that unused args get dropped.
 
-// CHECK-LABEL: func.func private @unused_arg_callee
+// CHECK-LABEL: util.func private @unused_arg_callee
 // CHECK-SAME: (%[[ARG1:.+]]: index) -> index
-func.func private @unused_arg_callee(%arg0: index, %arg1: index) -> index {
+util.func private @unused_arg_callee(%arg0: index, %arg1: index) -> index {
   // CHECK: %[[ADD:.+]] = arith.addi %[[ARG1]], %[[ARG1]]
   %add = arith.addi %arg1, %arg1 : index
-  // CHECK: return %[[ADD]]
-  return %add : index
+  // CHECK: util.return %[[ADD]]
+  util.return %add : index
 }
 
-// CHECK: func.func @unused_arg_caller_a(%[[A_ARG0:.+]]: index, %[[A_ARG1:.+]]: index)
-func.func @unused_arg_caller_a(%arg0: index, %arg1: index) -> (index, index) {
-  // CHECK: %[[A_RET0:.+]] = call @unused_arg_callee(%[[A_ARG0]]) : (index) -> index
-  %ret0 = call @unused_arg_callee(%arg0, %arg0) : (index, index) -> index
-  // CHECK: %[[A_RET1:.+]] = call @unused_arg_callee(%[[A_ARG1]]) : (index) -> index
-  %ret1 = call @unused_arg_callee(%arg0, %arg1) : (index, index) -> index
-  // CHECK: return %[[A_RET0]], %[[A_RET1]]
-  return %ret0, %ret1 : index, index
+// CHECK: util.func public @unused_arg_caller_a(%[[A_ARG0:.+]]: index, %[[A_ARG1:.+]]: index)
+util.func public @unused_arg_caller_a(%arg0: index, %arg1: index) -> (index, index) {
+  // CHECK: %[[A_RET0:.+]] = util.call @unused_arg_callee(%[[A_ARG0]]) : (index) -> index
+  %ret0 = util.call @unused_arg_callee(%arg0, %arg0) : (index, index) -> index
+  // CHECK: %[[A_RET1:.+]] = util.call @unused_arg_callee(%[[A_ARG1]]) : (index) -> index
+  %ret1 = util.call @unused_arg_callee(%arg0, %arg1) : (index, index) -> index
+  // CHECK: util.return %[[A_RET0]], %[[A_RET1]]
+  util.return %ret0, %ret1 : index, index
 }
-// CHECK: func.func @unused_arg_caller_b(%[[B_ARG0:.+]]: index)
-func.func @unused_arg_caller_b(%arg0: index) -> index {
-  // CHECK: %[[B_RET0:.+]] = call @unused_arg_callee(%[[B_ARG0]]) : (index) -> index
-  %ret0 = call @unused_arg_callee(%arg0, %arg0) : (index, index) -> index
-  // CHECK: return %[[B_RET0]]
-  return %ret0 : index
+// CHECK: util.func public @unused_arg_caller_b(%[[B_ARG0:.+]]: index)
+util.func public @unused_arg_caller_b(%arg0: index) -> index {
+  // CHECK: %[[B_RET0:.+]] = util.call @unused_arg_callee(%[[B_ARG0]]) : (index) -> index
+  %ret0 = util.call @unused_arg_callee(%arg0, %arg0) : (index, index) -> index
+  // CHECK: util.return %[[B_RET0]]
+  util.return %ret0 : index
 }
 
 // -----
 
 // Tests that uniformly unused results get dropped.
 
-// CHECK-LABEL: func.func private @unused_result_callee
+// CHECK-LABEL: util.func private @unused_result_callee
 // CHECK-SAME: (%[[ARG0:.+]]: index, %[[ARG1:.+]]: index) -> index
-func.func private @unused_result_callee(%arg0: index, %arg1: index) -> (index, index) {
+util.func private @unused_result_callee(%arg0: index, %arg1: index) -> (index, index) {
   // CHECK: %[[ADD0:.+]] = arith.addi %[[ARG0]], %[[ARG1]]
   %add0 = arith.addi %arg0, %arg1 : index
   // CHECK: %[[ADD1:.+]] = arith.addi %[[ADD0]], %[[ARG0]]
   %add1 = arith.addi %add0, %arg0 : index
-  // CHECK: return %[[ADD1]]
-  return %add0, %add1 : index, index
+  // CHECK: util.return %[[ADD1]]
+  util.return %add0, %add1 : index, index
 }
 
-// CHECK: func.func @unused_result_caller_a(%[[A_ARG0:.+]]: index, %[[A_ARG1:.+]]: index)
-func.func @unused_result_caller_a(%arg0: index, %arg1: index) -> index {
-  // CHECK: %[[A_RET1:.+]] = call @unused_result_callee(%[[A_ARG0]], %[[A_ARG1]]) : (index, index) -> index
-  %ret:2 = call @unused_result_callee(%arg0, %arg1) : (index, index) -> (index, index)
-  // CHECK: return %[[A_RET1]]
-  return %ret#1 : index
+// CHECK: util.func public @unused_result_caller_a(%[[A_ARG0:.+]]: index, %[[A_ARG1:.+]]: index)
+util.func public @unused_result_caller_a(%arg0: index, %arg1: index) -> index {
+  // CHECK: %[[A_RET1:.+]] = util.call @unused_result_callee(%[[A_ARG0]], %[[A_ARG1]]) : (index, index) -> index
+  %ret:2 = util.call @unused_result_callee(%arg0, %arg1) : (index, index) -> (index, index)
+  // CHECK: util.return %[[A_RET1]]
+  util.return %ret#1 : index
 }
-// CHECK: func.func @unused_result_caller_b(%[[B_ARG0:.+]]: index, %[[B_ARG1:.+]]: index)
-func.func @unused_result_caller_b(%arg0: index, %arg1: index) -> index {
-  // CHECK: %[[B_RET1:.+]] = call @unused_result_callee(%[[B_ARG0]], %[[B_ARG1]]) : (index, index) -> index
-  %ret:2 = call @unused_result_callee(%arg0, %arg1) : (index, index) -> (index, index)
-  // CHECK: return %[[B_RET1]]
-  return %ret#1 : index
+// CHECK: util.func public @unused_result_caller_b(%[[B_ARG0:.+]]: index, %[[B_ARG1:.+]]: index)
+util.func public @unused_result_caller_b(%arg0: index, %arg1: index) -> index {
+  // CHECK: %[[B_RET1:.+]] = util.call @unused_result_callee(%[[B_ARG0]], %[[B_ARG1]]) : (index, index) -> index
+  %ret:2 = util.call @unused_result_callee(%arg0, %arg1) : (index, index) -> (index, index)
+  // CHECK: util.return %[[B_RET1]]
+  util.return %ret#1 : index
 }
-// CHECK: func.func @unused_result_caller_c(%[[C_ARG0:.+]]: index, %[[C_ARG1:.+]]: index)
-func.func @unused_result_caller_c(%arg0: index, %arg1: index) {
-  // CHECK: %[[C_RET1:.+]] = call @unused_result_callee(%[[C_ARG0]], %[[C_ARG1]]) : (index, index) -> index
-  %ret:2 = call @unused_result_callee(%arg0, %arg1) : (index, index) -> (index, index)
-  // CHECK: return
-  return
+// CHECK: util.func public @unused_result_caller_c(%[[C_ARG0:.+]]: index, %[[C_ARG1:.+]]: index)
+util.func public @unused_result_caller_c(%arg0: index, %arg1: index) {
+  // CHECK: %[[C_RET1:.+]] = util.call @unused_result_callee(%[[C_ARG0]], %[[C_ARG1]]) : (index, index) -> index
+  %ret:2 = util.call @unused_result_callee(%arg0, %arg1) : (index, index) -> (index, index)
+  // CHECK: util.return
+  util.return
 }
 
 // -----
 
 // Tests that uniformly duplicate args get combined.
 
-// CHECK-LABEL: func.func private @dupe_arg_callee
+// CHECK-LABEL: util.func private @dupe_arg_callee
 // CHECK-SAME: (%[[ARG0:.+]]: index, %[[ARG1:.+]]: index) -> index
-func.func private @dupe_arg_callee(%arg0: index, %arg1: index, %arg0_dupe: index) -> index {
+util.func private @dupe_arg_callee(%arg0: index, %arg1: index, %arg0_dupe: index) -> index {
   // CHECK: %[[ADD0:.+]] = arith.addi %[[ARG0]], %[[ARG1]]
   %add0 = arith.addi %arg0, %arg1 : index
   // CHECK: %[[ADD1:.+]] = arith.addi %[[ADD0]], %[[ARG0]]
   %add1 = arith.addi %add0, %arg0_dupe : index
-  // CHECK: return %[[ADD1]]
-  return %add1 : index
+  // CHECK: util.return %[[ADD1]]
+  util.return %add1 : index
 }
 
-// CHECK: func.func @dupe_arg_caller_a(%[[A_ARG0:.+]]: index, %[[A_ARG1:.+]]: index)
-func.func @dupe_arg_caller_a(%arg0: index, %arg1: index) -> (index, index) {
-  // CHECK: %[[A_RET0:.+]] = call @dupe_arg_callee(%[[A_ARG0]], %[[A_ARG0]]) : (index, index) -> index
-  %ret0 = call @dupe_arg_callee(%arg0, %arg0, %arg0) : (index, index, index) -> index
-  // CHECK: %[[A_RET1:.+]] = call @dupe_arg_callee(%[[A_ARG0]], %[[A_ARG1]]) : (index, index) -> index
-  %ret1 = call @dupe_arg_callee(%arg0, %arg1, %arg0) : (index, index, index) -> index
-  // CHECK: return %[[A_RET0]], %[[A_RET1]]
-  return %ret0, %ret1 : index, index
+// CHECK: util.func public @dupe_arg_caller_a(%[[A_ARG0:.+]]: index, %[[A_ARG1:.+]]: index)
+util.func public @dupe_arg_caller_a(%arg0: index, %arg1: index) -> (index, index) {
+  // CHECK: %[[A_RET0:.+]] = util.call @dupe_arg_callee(%[[A_ARG0]], %[[A_ARG0]]) : (index, index) -> index
+  %ret0 = util.call @dupe_arg_callee(%arg0, %arg0, %arg0) : (index, index, index) -> index
+  // CHECK: %[[A_RET1:.+]] = util.call @dupe_arg_callee(%[[A_ARG0]], %[[A_ARG1]]) : (index, index) -> index
+  %ret1 = util.call @dupe_arg_callee(%arg0, %arg1, %arg0) : (index, index, index) -> index
+  // CHECK: util.return %[[A_RET0]], %[[A_RET1]]
+  util.return %ret0, %ret1 : index, index
 }
-// CHECK: func.func @dupe_arg_caller_b(%[[B_ARG0:.+]]: index)
-func.func @dupe_arg_caller_b(%arg0: index) -> index {
-  // CHECK: %[[B_RET0:.+]] = call @dupe_arg_callee(%[[B_ARG0]], %[[B_ARG0]]) : (index, index) -> index
-  %ret0 = call @dupe_arg_callee(%arg0, %arg0, %arg0) : (index, index, index) -> index
-  // CHECK: return %[[B_RET0]]
-  return %ret0 : index
+// CHECK: util.func public @dupe_arg_caller_b(%[[B_ARG0:.+]]: index)
+util.func public @dupe_arg_caller_b(%arg0: index) -> index {
+  // CHECK: %[[B_RET0:.+]] = util.call @dupe_arg_callee(%[[B_ARG0]], %[[B_ARG0]]) : (index, index) -> index
+  %ret0 = util.call @dupe_arg_callee(%arg0, %arg0, %arg0) : (index, index, index) -> index
+  // CHECK: util.return %[[B_RET0]]
+  util.return %ret0 : index
 }
 
 // -----
@@ -103,203 +103,203 @@
 // that base argument stays live. Note that %arg0 is not used in the callee
 // but a duplicate of it is.
 
-// CHECK-LABEL: func.func private @dupe_unused_arg_callee
+// CHECK-LABEL: util.func private @dupe_unused_arg_callee
 // CHECK-SAME: (%[[CALLEE_ARG0:.+]]: index) -> index
-func.func private @dupe_unused_arg_callee(%arg0: index, %arg0_dupe: index) -> (index, index) {
+util.func private @dupe_unused_arg_callee(%arg0: index, %arg0_dupe: index) -> (index, index) {
   // CHECK: %[[CALLEE_RET0:.+]] = arith.addi %[[CALLEE_ARG0]], %[[CALLEE_ARG0]]
   %ret0 = arith.addi %arg0_dupe, %arg0_dupe : index
-  // CHECK: return %[[CALLEE_RET0]]
-  return %ret0, %arg0 : index, index
+  // CHECK: util.return %[[CALLEE_RET0]]
+  util.return %ret0, %arg0 : index, index
 }
 
-// CHECK: func.func @dupe_unused_arg_caller(%[[CALLER_ARG0:.+]]: index)
-func.func @dupe_unused_arg_caller(%arg0: index) -> (index, index) {
-  // CHECK: %[[CALLER_RET0:.+]] = call @dupe_unused_arg_callee(%[[CALLER_ARG0]]) : (index) -> index
-  %ret:2 = call @dupe_unused_arg_callee(%arg0, %arg0) : (index, index) -> (index, index)
-  // CHECK: return %[[CALLER_RET0]], %[[CALLER_ARG0]]
-  return %ret#0, %ret#1 : index, index
+// CHECK: util.func public @dupe_unused_arg_caller(%[[CALLER_ARG0:.+]]: index)
+util.func public @dupe_unused_arg_caller(%arg0: index) -> (index, index) {
+  // CHECK: %[[CALLER_RET0:.+]] = util.call @dupe_unused_arg_callee(%[[CALLER_ARG0]]) : (index) -> index
+  %ret:2 = util.call @dupe_unused_arg_callee(%arg0, %arg0) : (index, index) -> (index, index)
+  // CHECK: util.return %[[CALLER_RET0]], %[[CALLER_ARG0]]
+  util.return %ret#0, %ret#1 : index, index
 }
 
 // -----
 
 // Tests that uniformly duplicate results get combined.
 
-// CHECK-LABEL: func.func private @dupe_result_callee
+// CHECK-LABEL: util.func private @dupe_result_callee
 // CHECK-SAME: (%[[ARG0:.+]]: i1, %[[ARG1:.+]]: index) -> (index, index)
-func.func private @dupe_result_callee(%arg0: i1, %arg1: index) -> (index, index, index) {
+util.func private @dupe_result_callee(%arg0: i1, %arg1: index) -> (index, index, index) {
   // CHECK: %[[ADD0:.+]] = arith.addi %[[ARG1]], %[[ARG1]]
   %add0 = arith.addi %arg1, %arg1 : index
   // CHECK: %[[ADD1:.+]] = arith.addi %[[ADD0]], %[[ARG1]]
   %add1 = arith.addi %add0, %arg1 : index
   cf.cond_br %arg0, ^bb1, ^bb2
 ^bb1:
-  // CHECK: return %[[ADD0]], %[[ADD0]]
-  return %add0, %add0, %add0 : index, index, index
+  // CHECK: util.return %[[ADD0]], %[[ADD0]]
+  util.return %add0, %add0, %add0 : index, index, index
 ^bb2:
-  // CHECK: return %[[ADD0]], %[[ADD1]]
-  return %add0, %add1, %add0 : index, index, index
+  // CHECK: util.return %[[ADD0]], %[[ADD1]]
+  util.return %add0, %add1, %add0 : index, index, index
 }
 
-// CHECK: func.func @dupe_result_caller(%[[ARG0:.+]]: i1, %[[ARG1:.+]]: index)
-func.func @dupe_result_caller(%arg0: i1, %arg1: index) -> (index, index, index) {
-  // CHECK: %[[RET:.+]]:2 = call @dupe_result_callee(%[[ARG0]], %[[ARG1]]) : (i1, index) -> (index, index)
-  %ret:3 = call @dupe_result_callee(%arg0, %arg1) : (i1, index) -> (index, index, index)
-  // CHECK: return %[[RET]]#0, %[[RET]]#1, %[[RET]]#0
-  return %ret#0, %ret#1, %ret#2 : index, index, index
+// CHECK: util.func public @dupe_result_caller(%[[ARG0:.+]]: i1, %[[ARG1:.+]]: index)
+util.func public @dupe_result_caller(%arg0: i1, %arg1: index) -> (index, index, index) {
+  // CHECK: %[[RET:.+]]:2 = util.call @dupe_result_callee(%[[ARG0]], %[[ARG1]]) : (i1, index) -> (index, index)
+  %ret:3 = util.call @dupe_result_callee(%arg0, %arg1) : (i1, index) -> (index, index, index)
+  // CHECK: util.return %[[RET]]#0, %[[RET]]#1, %[[RET]]#0
+  util.return %ret#0, %ret#1, %ret#2 : index, index, index
 }
 
 // -----
 
 // Tests that uniformly constant args get inlined into callees.
 
-// CHECK-LABEL: func.func private @uniform_arg_callee
+// CHECK-LABEL: util.func private @uniform_arg_callee
 // CHECK-SAME: () -> index
-func.func private @uniform_arg_callee(%arg0: index) -> index {
+util.func private @uniform_arg_callee(%arg0: index) -> index {
   // CHECK: %[[C1:.+]] = arith.constant 1
   // CHECK: %[[ADD:.+]] = arith.addi %[[C1]], %[[C1]]
   %add = arith.addi %arg0, %arg0 : index
-  // CHECK: return %[[ADD]]
-  return %add : index
+  // CHECK: util.return %[[ADD]]
+  util.return %add : index
 }
 
-// CHECK: func.func @uniform_arg_caller_a
-func.func @uniform_arg_caller_a() -> (index, index) {
+// CHECK: util.func public @uniform_arg_caller_a
+util.func public @uniform_arg_caller_a() -> (index, index) {
   %c1 = arith.constant 1 : index
-  // CHECK: %[[A_RET0:.+]] = call @uniform_arg_callee() : () -> index
-  %ret0 = call @uniform_arg_callee(%c1) : (index) -> index
-  // CHECK: %[[A_RET1:.+]] = call @uniform_arg_callee() : () -> index
-  %ret1 = call @uniform_arg_callee(%c1) : (index) -> index
-  // CHECK: return %[[A_RET0]], %[[A_RET1]]
-  return %ret0, %ret1 : index, index
+  // CHECK: %[[A_RET0:.+]] = util.call @uniform_arg_callee() : () -> index
+  %ret0 = util.call @uniform_arg_callee(%c1) : (index) -> index
+  // CHECK: %[[A_RET1:.+]] = util.call @uniform_arg_callee() : () -> index
+  %ret1 = util.call @uniform_arg_callee(%c1) : (index) -> index
+  // CHECK: util.return %[[A_RET0]], %[[A_RET1]]
+  util.return %ret0, %ret1 : index, index
 }
-// CHECK: func.func @uniform_arg_caller_b
-func.func @uniform_arg_caller_b() -> index {
+// CHECK: util.func public @uniform_arg_caller_b
+util.func public @uniform_arg_caller_b() -> index {
   %c1 = arith.constant 1 : index
-  // CHECK: %[[B_RET0:.+]] = call @uniform_arg_callee() : () -> index
-  %ret0 = call @uniform_arg_callee(%c1) : (index) -> index
-  // CHECK: return %[[B_RET0]]
-  return %ret0 : index
+  // CHECK: %[[B_RET0:.+]] = util.call @uniform_arg_callee() : () -> index
+  %ret0 = util.call @uniform_arg_callee(%c1) : (index) -> index
+  // CHECK: util.return %[[B_RET0]]
+  util.return %ret0 : index
 }
 
 // -----
 
 // Tests that uniformly constant results get inlined into callers.
 
-// CHECK-LABEL: func.func private @uniform_result_callee
+// CHECK-LABEL: util.func private @uniform_result_callee
 // CHECK-SAME: (%[[ARG0:.+]]: i1)
-func.func private @uniform_result_callee(%arg0: i1) -> index {
+util.func private @uniform_result_callee(%arg0: i1) -> index {
   %c0 = arith.constant 0 : index
   cf.cond_br %arg0, ^bb1, ^bb2
 ^bb1:
-  // CHECK: return
-  return %c0 : index
+  // CHECK: util.return
+  util.return %c0 : index
 ^bb2:
-  // CHECK: return
-  return %c0 : index
+  // CHECK: util.return
+  util.return %c0 : index
 }
 
-// CHECK: func.func @uniform_result_caller(%[[ARG0:.+]]: i1)
-func.func @uniform_result_caller(%arg0: i1) -> index {
+// CHECK: util.func public @uniform_result_caller(%[[ARG0:.+]]: i1)
+util.func public @uniform_result_caller(%arg0: i1) -> index {
   // CHECK: call @uniform_result_callee(%[[ARG0]]) : (i1) -> ()
-  %ret0 = call @uniform_result_callee(%arg0) : (i1) -> index
+  %ret0 = util.call @uniform_result_callee(%arg0) : (i1) -> index
   // CHECK: %[[C0:.+]] = arith.constant 0
-  // CHECK: return %[[C0]]
-  return %ret0 : index
+  // CHECK: util.return %[[C0]]
+  util.return %ret0 : index
 }
 
 // -----
 
 // Tests that uniformly duplicate constant results get combined/inlined.
 
-// CHECK-LABEL: func.func private @dupe_constant_result_callee
+// CHECK-LABEL: util.func private @dupe_constant_result_callee
 // CHECK-SAME: (%[[ARG0:.+]]: i1) -> index
-func.func private @dupe_constant_result_callee(%arg0: i1) -> (index, index, index) {
+util.func private @dupe_constant_result_callee(%arg0: i1) -> (index, index, index) {
   // CHECK: %[[C0:.+]] = arith.constant 0
   %c0 = arith.constant 0 : index
   // CHECK: %[[C1:.+]] = arith.constant 1
   %c1 = arith.constant 1 : index
   cf.cond_br %arg0, ^bb1, ^bb2
 ^bb1:
-  // CHECK: return %[[C0]]
-  return %c0, %c0, %c0 : index, index, index
+  // CHECK: util.return %[[C0]]
+  util.return %c0, %c0, %c0 : index, index, index
 ^bb2:
-  // CHECK: return %[[C1]]
-  return %c0, %c1, %c0 : index, index, index
+  // CHECK: util.return %[[C1]]
+  util.return %c0, %c1, %c0 : index, index, index
 }
 
-// CHECK: func.func @dupe_constant_result_caller(%[[ARG0:.+]]: i1)
-func.func @dupe_constant_result_caller(%arg0: i1) -> (index, index, index) {
-  // CHECK: %[[RET:.+]] = call @dupe_constant_result_callee(%[[ARG0]]) : (i1) -> index
-  %ret:3 = call @dupe_constant_result_callee(%arg0) : (i1) -> (index, index, index)
+// CHECK: util.func public @dupe_constant_result_caller(%[[ARG0:.+]]: i1)
+util.func public @dupe_constant_result_caller(%arg0: i1) -> (index, index, index) {
+  // CHECK: %[[RET:.+]] = util.call @dupe_constant_result_callee(%[[ARG0]]) : (i1) -> index
+  %ret:3 = util.call @dupe_constant_result_callee(%arg0) : (i1) -> (index, index, index)
   // CHECK: %[[C0_INLINE:.+]] = arith.constant 0
   // CHECK-NEXT: %[[C0_INLINE_DUPE:.+]] = arith.constant 0
-  // CHECK: return %[[C0_INLINE]], %[[RET]], %[[C0_INLINE_DUPE]]
-  return %ret#0, %ret#1, %ret#2 : index, index, index
+  // CHECK: util.return %[[C0_INLINE]], %[[RET]], %[[C0_INLINE_DUPE]]
+  util.return %ret#0, %ret#1, %ret#2 : index, index, index
 }
 
 // -----
 
 // Tests that public functions are unmodified (the unused arg is not dropped).
 
-// CHECK-LABEL: func.func public @public_unused_arg
+// CHECK-LABEL: util.func public @public_unused_arg
 // CHECK-SAME: (%[[ARG0:.+]]: index)
-func.func public @public_unused_arg(%arg0: index) {
-  return
+util.func public @public_unused_arg(%arg0: index) {
+  util.return
 }
 
 // -----
 
 // Tests that non-uniform call args don't get optimized.
 
-// CHECK-LABEL: func.func private @nonuniform_arg_callee
+// CHECK-LABEL: util.func private @nonuniform_arg_callee
 // CHECK-SAME: (%[[ARG0:.+]]: index) -> index
-func.func private @nonuniform_arg_callee(%arg0: index) -> index {
+util.func private @nonuniform_arg_callee(%arg0: index) -> index {
   // CHECK: %[[ADD:.+]] = arith.addi %[[ARG0]], %[[ARG0]]
   %add = arith.addi %arg0, %arg0 : index
-  // CHECK: return %[[ADD]]
-  return %add : index
+  // CHECK: util.return %[[ADD]]
+  util.return %add : index
 }
 
-// CHECK: func.func @nonuniform_arg_caller_a(%[[A_ARG0:.+]]: index)
-func.func @nonuniform_arg_caller_a(%arg0: index) -> (index, index) {
-  // CHECK: %[[A_RET0:.+]] = call @nonuniform_arg_callee(%[[A_ARG0]]) : (index) -> index
-  %ret0 = call @nonuniform_arg_callee(%arg0) : (index) -> index
-  // CHECK: %[[A_RET1:.+]] = call @nonuniform_arg_callee(%[[A_ARG0]]) : (index) -> index
-  %ret1 = call @nonuniform_arg_callee(%arg0) : (index) -> index
-  // CHECK: return %[[A_RET0]], %[[A_RET1]]
-  return %ret0, %ret1 : index, index
+// CHECK: util.func public @nonuniform_arg_caller_a(%[[A_ARG0:.+]]: index)
+util.func public @nonuniform_arg_caller_a(%arg0: index) -> (index, index) {
+  // CHECK: %[[A_RET0:.+]] = util.call @nonuniform_arg_callee(%[[A_ARG0]]) : (index) -> index
+  %ret0 = util.call @nonuniform_arg_callee(%arg0) : (index) -> index
+  // CHECK: %[[A_RET1:.+]] = util.call @nonuniform_arg_callee(%[[A_ARG0]]) : (index) -> index
+  %ret1 = util.call @nonuniform_arg_callee(%arg0) : (index) -> index
+  // CHECK: util.return %[[A_RET0]], %[[A_RET1]]
+  util.return %ret0, %ret1 : index, index
 }
-// CHECK: func.func @nonuniform_arg_caller_b(%[[B_ARG0:.+]]: index)
-func.func @nonuniform_arg_caller_b(%arg0: index) -> index {
-  // CHECK: %[[B_RET0:.+]] = call @nonuniform_arg_callee(%[[B_ARG0]]) : (index) -> index
-  %ret0 = call @nonuniform_arg_callee(%arg0) : (index) -> index
-  // CHECK: return %[[B_RET0]]
-  return %ret0 : index
+// CHECK: util.func public @nonuniform_arg_caller_b(%[[B_ARG0:.+]]: index)
+util.func public @nonuniform_arg_caller_b(%arg0: index) -> index {
+  // CHECK: %[[B_RET0:.+]] = util.call @nonuniform_arg_callee(%[[B_ARG0]]) : (index) -> index
+  %ret0 = util.call @nonuniform_arg_callee(%arg0) : (index) -> index
+  // CHECK: util.return %[[B_RET0]]
+  util.return %ret0 : index
 }
 
 // -----
 
 // Tests that non-uniform call args w/ constants don't get optimized.
 
-// CHECK-LABEL: func.func private @nonuniform_constant_arg_callee
+// CHECK-LABEL: util.func private @nonuniform_constant_arg_callee
 // CHECK-SAME: (%[[ARG0:.+]]: index) -> index
-func.func private @nonuniform_constant_arg_callee(%arg0: index) -> index {
+util.func private @nonuniform_constant_arg_callee(%arg0: index) -> index {
   // CHECK: %[[ADD:.+]] = arith.addi %[[ARG0]], %[[ARG0]]
   %add = arith.addi %arg0, %arg0 : index
-  // CHECK: return %[[ADD]]
-  return %add : index
+  // CHECK: util.return %[[ADD]]
+  util.return %add : index
 }
 
-// CHECK: func.func @nonuniform_arg_caller(%[[CALLER_ARG0:.+]]: index)
-func.func @nonuniform_arg_caller(%arg0: index) -> (index, index) {
+// CHECK: util.func public @nonuniform_arg_caller(%[[CALLER_ARG0:.+]]: index)
+util.func public @nonuniform_arg_caller(%arg0: index) -> (index, index) {
   // CHECK-DAG: %[[C10:.+]] = arith.constant 10
   %c10 = arith.constant 10 : index
-  // CHECK: %[[RET0:.+]] = call @nonuniform_constant_arg_callee(%[[CALLER_ARG0]]) : (index) -> index
-  %ret0 = call @nonuniform_constant_arg_callee(%arg0) : (index) -> index
-  // CHECK: %[[RET1:.+]] = call @nonuniform_constant_arg_callee(%[[C10]]) : (index) -> index
-  %ret1 = call @nonuniform_constant_arg_callee(%c10) : (index) -> index
-  // CHECK: return %[[RET0]], %[[RET1]]
-  return %ret0, %ret1 : index, index
+  // CHECK: %[[RET0:.+]] = util.call @nonuniform_constant_arg_callee(%[[CALLER_ARG0]]) : (index) -> index
+  %ret0 = util.call @nonuniform_constant_arg_callee(%arg0) : (index) -> index
+  // CHECK: %[[RET1:.+]] = util.call @nonuniform_constant_arg_callee(%[[C10]]) : (index) -> index
+  %ret1 = util.call @nonuniform_constant_arg_callee(%c10) : (index) -> index
+  // CHECK: util.return %[[RET0]], %[[RET1]]
+  util.return %ret0, %ret1 : index, index
 }
 
 // -----
@@ -307,53 +307,104 @@
 // Tests that non-uniform call args w/ constants don't get optimized (order
 // flipped from above).
 
-// CHECK-LABEL: func.func private @nonuniform_constant_arg_callee_flipped
+// CHECK-LABEL: util.func private @nonuniform_constant_arg_callee_flipped
 // CHECK-SAME: (%[[ARG0:.+]]: index) -> index
-func.func private @nonuniform_constant_arg_callee_flipped(%arg0: index) -> index {
+util.func private @nonuniform_constant_arg_callee_flipped(%arg0: index) -> index {
   // CHECK: %[[ADD:.+]] = arith.addi %[[ARG0]], %[[ARG0]]
   %add = arith.addi %arg0, %arg0 : index
-  // CHECK: return %[[ADD]]
-  return %add : index
+  // CHECK: util.return %[[ADD]]
+  util.return %add : index
 }
 
-// CHECK: func.func @nonuniform_arg_caller_flipped(%[[CALLER_ARG0:.+]]: index)
-func.func @nonuniform_arg_caller_flipped(%arg0: index) -> (index, index) {
+// CHECK: util.func public @nonuniform_arg_caller_flipped(%[[CALLER_ARG0:.+]]: index)
+util.func public @nonuniform_arg_caller_flipped(%arg0: index) -> (index, index) {
   // CHECK-DAG: %[[C10:.+]] = arith.constant 10
   %c10 = arith.constant 10 : index
-  // CHECK: %[[RET0:.+]] = call @nonuniform_constant_arg_callee_flipped(%[[C10]]) : (index) -> index
-  %ret0 = call @nonuniform_constant_arg_callee_flipped(%c10) : (index) -> index
-  // CHECK: %[[RET1:.+]] = call @nonuniform_constant_arg_callee_flipped(%[[CALLER_ARG0]]) : (index) -> index
-  %ret1 = call @nonuniform_constant_arg_callee_flipped(%arg0) : (index) -> index
-  // CHECK: return %[[RET0]], %[[RET1]]
-  return %ret0, %ret1 : index, index
+  // CHECK: %[[RET0:.+]] = util.call @nonuniform_constant_arg_callee_flipped(%[[C10]]) : (index) -> index
+  %ret0 = util.call @nonuniform_constant_arg_callee_flipped(%c10) : (index) -> index
+  // CHECK: %[[RET1:.+]] = util.call @nonuniform_constant_arg_callee_flipped(%[[CALLER_ARG0]]) : (index) -> index
+  %ret1 = util.call @nonuniform_constant_arg_callee_flipped(%arg0) : (index) -> index
+  // CHECK: util.return %[[RET0]], %[[RET1]]
+  util.return %ret0, %ret1 : index, index
+}
+
+// -----
+
+// Tests that non-uniform call args w/ constants don't get optimized.
+
+// CHECK-LABEL: util.func private @nonuniform_constant_arg_callee
+// CHECK-SAME: (%[[ARG0:.+]]: index) -> index
+util.func private @nonuniform_constant_arg_callee(%arg0: index) -> index {
+  // CHECK: %[[ADD:.+]] = arith.addi %[[ARG0]], %[[ARG0]]
+  %add = arith.addi %arg0, %arg0 : index
+  // CHECK: util.return %[[ADD]]
+  util.return %add : index
+}
+
+// CHECK: util.func public @nonuniform_arg_caller(%[[CALLER_ARG0:.+]]: index)
+util.func public @nonuniform_arg_caller(%arg0: index) -> (index, index) {
+  // CHECK-DAG: %[[C10:.+]] = arith.constant 10
+  %c10 = arith.constant 10 : index
+  // CHECK: %[[RET0:.+]] = util.call @nonuniform_constant_arg_callee(%[[CALLER_ARG0]]) : (index) -> index
+  %ret0 = util.call @nonuniform_constant_arg_callee(%arg0) : (index) -> index
+  // CHECK: %[[RET1:.+]] = util.call @nonuniform_constant_arg_callee(%[[C10]]) : (index) -> index
+  %ret1 = util.call @nonuniform_constant_arg_callee(%c10) : (index) -> index
+  // CHECK: util.return %[[RET0]], %[[RET1]]
+  util.return %ret0, %ret1 : index, index
+}
+
+// -----
+
+// Tests that non-uniform call args w/ constants don't get optimized (order
+// flipped from above).
+
+// CHECK-LABEL: util.func private @nonuniform_constant_arg_callee_flipped
+// CHECK-SAME: (%[[ARG0:.+]]: index) -> index
+util.func private @nonuniform_constant_arg_callee_flipped(%arg0: index) -> index {
+  // CHECK: %[[ADD:.+]] = arith.addi %[[ARG0]], %[[ARG0]]
+  %add = arith.addi %arg0, %arg0 : index
+  // CHECK: util.return %[[ADD]]
+  util.return %add : index
+}
+
+// CHECK: util.func public @nonuniform_arg_caller_flipped(%[[CALLER_ARG0:.+]]: index)
+util.func public @nonuniform_arg_caller_flipped(%arg0: index) -> (index, index) {
+  // CHECK-DAG: %[[C10:.+]] = arith.constant 10
+  %c10 = arith.constant 10 : index
+  // CHECK: %[[RET0:.+]] = util.call @nonuniform_constant_arg_callee_flipped(%[[C10]]) : (index) -> index
+  %ret0 = util.call @nonuniform_constant_arg_callee_flipped(%c10) : (index) -> index
+  // CHECK: %[[RET1:.+]] = util.call @nonuniform_constant_arg_callee_flipped(%[[CALLER_ARG0]]) : (index) -> index
+  %ret1 = util.call @nonuniform_constant_arg_callee_flipped(%arg0) : (index) -> index
+  // CHECK: util.return %[[RET0]], %[[RET1]]
+  util.return %ret0, %ret1 : index, index
 }
 
 // -----
 
 // Tests that non-uniform call results don't get optimized.
 
-// CHECK-LABEL: func.func private @nonuniform_result_callee
+// CHECK-LABEL: util.func private @nonuniform_result_callee
 // CHECK-SAME: (%[[ARG0:.+]]: i1) -> index
-func.func private @nonuniform_result_callee(%arg0: i1) -> index {
+util.func private @nonuniform_result_callee(%arg0: i1) -> index {
   cf.cond_br %arg0, ^bb1, ^bb2
 ^bb1:
   // CHECK: %[[C0:.+]] = arith.constant 0
   %c0 = arith.constant 0 : index
-  // CHECK: return %[[C0]]
-  return %c0 : index
+  // CHECK: util.return %[[C0]]
+  util.return %c0 : index
 ^bb2:
   // CHECK: %[[C1:.+]] = arith.constant 1
   %c1 = arith.constant 1 : index
-  // CHECK: return %[[C1]]
-  return %c1 : index
+  // CHECK: util.return %[[C1]]
+  util.return %c1 : index
 }
 
-// CHECK: func.func @nonuniform_result_caller(%[[ARG0:.+]]: i1)
-func.func @nonuniform_result_caller(%arg0: i1) -> index {
-  // CHECK: %[[RET0:.+]] = call @nonuniform_result_callee(%[[ARG0]]) : (i1) -> index
-  %ret0 = call @nonuniform_result_callee(%arg0) : (i1) -> index
-  // CHECK: return %[[RET0]]
-  return %ret0 : index
+// CHECK: util.func public @nonuniform_result_caller(%[[ARG0:.+]]: i1)
+util.func public @nonuniform_result_caller(%arg0: i1) -> index {
+  // CHECK: %[[RET0:.+]] = util.call @nonuniform_result_callee(%[[ARG0]]) : (i1) -> index
+  %ret0 = util.call @nonuniform_result_callee(%arg0) : (i1) -> index
+  // CHECK: util.return %[[RET0]]
+  util.return %ret0 : index
 }
 
 // -----
@@ -361,18 +412,18 @@
 // Tests that args that directly pass-through to results get hoisted out into
 // the caller.
 
-// CHECK-LABEL: func.func private @passthrough_callee() {
-func.func private @passthrough_callee(%arg0: index) -> index {
-  // CHECK: return
-  return %arg0 : index
+// CHECK-LABEL: util.func private @passthrough_callee() {
+util.func private @passthrough_callee(%arg0: index) -> index {
+  // CHECK: util.return
+  util.return %arg0 : index
 }
 
-// CHECK: func.func @passthrough_caller(%[[ARG0:.+]]: index)
-func.func @passthrough_caller(%arg0: index) -> index {
+// CHECK: util.func public @passthrough_caller(%[[ARG0:.+]]: index)
+util.func public @passthrough_caller(%arg0: index) -> index {
   // CHECK: call @passthrough_callee() : () -> ()
-  %ret0 = call @passthrough_callee(%arg0) : (index) -> index
-  // CHECK: return %[[ARG0]]
-  return %ret0 : index
+  %ret0 = util.call @passthrough_callee(%arg0) : (index) -> index
+  // CHECK: util.return %[[ARG0]]
+  util.return %ret0 : index
 }
 
 // -----
@@ -380,19 +431,19 @@
 // Tests that args that directly pass-through to results get hoisted out into
 // the caller but they are preserved as args if they are used for other things.
 
-// CHECK-LABEL: func.func private @passthrough_preserve_arg_callee
+// CHECK-LABEL: util.func private @passthrough_preserve_arg_callee
 // CHECK-SAME: (%[[ARG0:.+]]: index) -> index {
-func.func private @passthrough_preserve_arg_callee(%arg0: index) -> (index, index) {
+util.func private @passthrough_preserve_arg_callee(%arg0: index) -> (index, index) {
   // CHECK: %[[ADD:.+]] = arith.addi %[[ARG0]], %[[ARG0]]
   %add = arith.addi %arg0, %arg0 : index
-  // CHECK: return %[[ADD]]
-  return %arg0, %add : index, index
+  // CHECK: util.return %[[ADD]]
+  util.return %arg0, %add : index, index
 }
 
-// CHECK: func.func @passthrough_preserve_arg_caller(%[[ARG0:.+]]: index)
-func.func @passthrough_preserve_arg_caller(%arg0: index) -> (index, index) {
-  // CHECK: %[[RET1:.+]] = call @passthrough_preserve_arg_callee(%[[ARG0]]) : (index) -> index
-  %ret:2 = call @passthrough_preserve_arg_callee(%arg0) : (index) -> (index, index)
-  // CHECK: return %[[ARG0]], %[[RET1]]
-  return %ret#0, %ret#1 : index, index
+// CHECK: util.func public @passthrough_preserve_arg_caller(%[[ARG0:.+]]: index)
+util.func public @passthrough_preserve_arg_caller(%arg0: index) -> (index, index) {
+  // CHECK: %[[RET1:.+]] = util.call @passthrough_preserve_arg_callee(%[[ARG0]]) : (index) -> index
+  %ret:2 = util.call @passthrough_preserve_arg_callee(%arg0) : (index) -> (index, index)
+  // CHECK: util.return %[[ARG0]], %[[RET1]]
+  util.return %ret#0, %ret#1 : index, index
 }

diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/outline_constants.mlir b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/outline_constants.mlir
index b43f0fb..76b27da 100644
--- a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/outline_constants.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/outline_constants.mlir

@@ -1,19 +1,19 @@
 // RUN: iree-opt --split-input-file --iree-util-outline-constants %s | FileCheck %s
 
 // CHECK-LABEL: @scalarConstant
-func.func @scalarConstant() {
+util.func @scalarConstant() {
   // CHECK: = arith.constant 0 : i32
   %cst = arith.constant 0 : i32
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @splatConstant
-func.func @splatConstant() {
+util.func @splatConstant() {
   // CHECK: = arith.constant dense<1.200000e+00> : tensor<512x128xf32>
   %cst = arith.constant dense<1.2> : tensor<512x128xf32>
-  return
+  util.return
 }
 
 // -----
@@ -21,10 +21,10 @@
 //       CHECK: util.global private @_constant {inlining_policy = #util.inline.never} = dense<[0.0287729427, 0.0297581609]> : tensor<2xf32>
 //  CHECK-NEXT: util.global private @_constant_0 {inlining_policy = #util.inline.never} = dense<[0.000000e+00, 1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00, 7.000000e+00]> : tensor<8xf32>
 // CHECK-LABEL: @denseConstants
-func.func @denseConstants() {
+util.func @denseConstants() {
   // CHECK: = util.global.load @_constant : tensor<2xf32>
   %cst_0 = arith.constant dense<[0.0287729427, 0.0297581609]> : tensor<2xf32>
   // CHECK-NEXT: = util.global.load @_constant_0 : tensor<8xf32>
   %cst_1 = arith.constant dense<[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]> : tensor<8xf32>
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/patterns.mlir b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/patterns.mlir
index cac5804..becfa2a 100644
--- a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/patterns.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/patterns.mlir

@@ -2,7 +2,7 @@
 
 // CHECK-LABEL: @foldBrArguments
 // CHECK-SAME: (%[[COND:.+]]: i1, %[[ARG1:.+]]: index)
-func.func @foldBrArguments(%cond: i1, %arg1: index) -> index {
+util.func @foldBrArguments(%cond: i1, %arg1: index) -> index {
   // CHECK: cf.cond_br %[[COND]]
   cf.cond_br %cond, ^bb1, ^bb2
 ^bb1:
@@ -19,15 +19,15 @@
 ^bb3(%bb3_0: index, %bb3_1: index, %bb3_2: index, %bb3_3: index):
   // CHECK: %[[OP3:.+]] = "some.op3"(%[[BB3_ARG0]], %[[BB3_ARG1]], %[[BB3_ARG2]], %[[BB3_ARG1]])
   %2 = "some.op3"(%bb3_0, %bb3_1, %bb3_2, %bb3_3) : (index, index, index, index) -> index
-  // CHECK: return %[[OP3]]
-  return %2 : index
+  // CHECK: util.return %[[OP3]]
+  util.return %2 : index
 }
 
 // -----
 
 // CHECK-LABEL: @foldCondBrArguments
 // CHECK-SAME: (%[[COND:.+]]: i1, %[[ARG1:.+]]: index, %[[ARG2:.+]]: index)
-func.func @foldCondBrArguments(%cond: i1, %arg1: index, %arg2: index) -> index {
+util.func @foldCondBrArguments(%cond: i1, %arg1: index, %arg2: index) -> index {
   // CHECK: cf.cond_br %[[COND]], ^bb1, ^bb2
   cf.cond_br %cond, ^bb1(%arg1, %arg2, %arg2 : index, index, index),
                  ^bb2(%arg1, %arg1, %arg2 : index, index, index)
@@ -36,20 +36,20 @@
   // CHECK: %[[OP1:.+]] = "some.op1"(%[[ARG1]], %[[ARG2]], %[[ARG2]])
   %0 = "some.op1"(%bb1_0, %bb1_1, %bb1_2) : (index, index, index) -> index
   // CHECK: %[[OP1]]
-  return %0 : index
+  util.return %0 : index
   // CHECK: ^bb2:
 ^bb2(%bb2_0: index, %bb2_1: index, %bb2_2: index):
   // CHECK: %[[OP2:.+]] = "some.op2"(%[[ARG1]], %[[ARG1]], %[[ARG2]])
   %1 = "some.op2"(%bb2_0, %bb2_1, %bb2_2) : (index, index, index) -> index
-  // CHECK: return %[[OP2]]
-  return %1 : index
+  // CHECK: util.return %[[OP2]]
+  util.return %1 : index
 }
 
 // -----
 
 // CHECK-LABEL: @elideBranchOperands
 // CHECK-SAME: (%[[ARG0:.+]]: index, %[[ARG1:.+]]: index)
-func.func @elideBranchOperands(%arg0: index, %arg1: index) -> i32 {
+util.func @elideBranchOperands(%arg0: index, %arg1: index) -> i32 {
   // CHECK-DAG: %[[C5I32:.+]] = arith.constant 5 : i32
   // CHECK-DAG: %[[C1I32:.+]] = arith.constant 1 : i32
   // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
@@ -75,15 +75,15 @@
   cf.br ^loopHeader(%newValue, %newCounter, %bodyMax : i32, index, index)
   // CHECK: ^bb3:
 ^exit(%finalValue: i32):
-  // CHECK: return %[[BB1_ARG0]]
-  return %finalValue : i32
+  // CHECK: util.return %[[BB1_ARG0]]
+  util.return %finalValue : i32
 }
 
 // -----
 
 // CHECK-LABEL: @indexSwitchToIf
 // CHECK-SAME: (%[[CASE:.+]]: index)
-func.func @indexSwitchToIf(%case: index) -> (i32, i64) {
+util.func @indexSwitchToIf(%case: index) -> (i32, i64) {
   // CHECK-NOT: scf.index_switch
   // CHECK: %[[C1:.+]] = arith.constant 1 : index
   // CHECK: %[[IS_CASE_1:.+]] = arith.cmpi eq, %[[CASE]], %[[C1]]
@@ -106,15 +106,15 @@
     // CHECK-NEXT: scf.yield %[[DEFAULT_A]], %[[DEFAULT_B]]
     scf.yield %default_a, %default_b : i32, i64
   }
-  // CHECK: return %[[RESULTS]]#0, %[[RESULTS]]#1
-  return %results#0, %results#1 : i32, i64
+  // CHECK: util.return %[[RESULTS]]#0, %[[RESULTS]]#1
+  util.return %results#0, %results#1 : i32, i64
 }
 
 // -----
 
 // CHECK-LABEL: @indexSwitchToIfNoResult
 // CHECK-SAME: (%[[CASE:.+]]: index)
-func.func @indexSwitchToIfNoResult(%case: index) {
+util.func @indexSwitchToIfNoResult(%case: index) {
   // CHECK-NOT: scf.index_switch
   // CHECK: %[[C1:.+]] = arith.constant 1 : index
   // CHECK: %[[IS_CASE_1:.+]] = arith.cmpi eq, %[[CASE]], %[[C1]]
@@ -131,15 +131,15 @@
     "some.op.default"() : () -> ()
     scf.yield
   }
-  // CHECK: return
-  return
+  // CHECK: util.return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @indexSwitchToIfNoDefault
 // CHECK-SAME: (%[[CASE:.+]]: index)
-func.func @indexSwitchToIfNoDefault(%case: index) {
+util.func @indexSwitchToIfNoDefault(%case: index) {
   // CHECK-NOT: scf.index_switch
   // CHECK: %[[C1:.+]] = arith.constant 1 : index
   // CHECK: %[[IS_CASE_1:.+]] = arith.cmpi eq, %[[CASE]], %[[C1]]
@@ -153,15 +153,15 @@
   // CHECK-NOT: } else {
   default {
   }
-  // CHECK: return
-  return
+  // CHECK: util.return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @mergeIndexSwitches
 // CHECK-SAME: (%[[CASE:.+]]: index)
-func.func @mergeIndexSwitches(%case: index) -> (i32, i32) {
+util.func @mergeIndexSwitches(%case: index) -> (i32, i32) {
   // CHECK: %[[RESULTS:.+]]:2 = scf.index_switch %[[CASE]] -> i32, i32
   %result0 = scf.index_switch %case -> i32
   // CHECK-NEXT: case 0 {
@@ -204,15 +204,15 @@
     %default = "some.op1.default"(%result0) : (i32) -> i32
     scf.yield %default : i32
   }
-  // CHECK: return %[[RESULTS]]#0, %[[RESULTS]]#1
-  return %result0, %result1 : i32, i32
+  // CHECK: util.return %[[RESULTS]]#0, %[[RESULTS]]#1
+  util.return %result0, %result1 : i32, i32
 }
 
 // -----
 
 // CHECK-LABEL: @mergeIndexSwitchesNoResult
 // CHECK-SAME: (%[[CASE:.+]]: index)
-func.func @mergeIndexSwitchesNoResult(%case: index) {
+util.func @mergeIndexSwitchesNoResult(%case: index) {
   // CHECK: scf.index_switch %[[CASE]]
   scf.index_switch %case
   // CHECK-NEXT: case 0 {
@@ -253,14 +253,14 @@
   default {
     "some.op1.default"() : () -> ()
   }
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @mergeIndexSwitchesIntoEmptyDefault
 // CHECK-SAME: (%[[CASE:.+]]: index)
-func.func @mergeIndexSwitchesIntoEmptyDefault(%case: index) {
+util.func @mergeIndexSwitchesIntoEmptyDefault(%case: index) {
   // CHECK: scf.index_switch %[[CASE]]
   scf.index_switch %case
   // CHECK-NEXT: case 0 {
@@ -295,5 +295,5 @@
   }
   default {
   }
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/promote_bf16_to_f32.mlir b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/promote_bf16_to_f32.mlir
index be8edc0..9080f63 100644
--- a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/promote_bf16_to_f32.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/promote_bf16_to_f32.mlir

@@ -1,26 +1,26 @@
 // RUN: iree-opt --split-input-file --iree-util-promote-bf16-to-f32 %s | FileCheck %s
 
 // CHECK: util.global {{.*}} : tensor<4xf32>
-// CHECK-LABEL: func.func @simple_bf16() -> tensor<4xf32>
+// CHECK-LABEL: util.func public @simple_bf16() -> tensor<4xf32>
 // CHECK-NEXT: %{{.*}} = util.global.address @__global : !util.ptr<tensor<4xf32>>
 // CHECK-NEXT: %{{.*}} = util.global.load.indirect %{{.*}} : !util.ptr<tensor<4xf32>> -> tensor<4xf32>
-// CHECK-NEXT: return %{{.*}} : tensor<4xf32>
+// CHECK-NEXT: util.return %{{.*}} : tensor<4xf32>
 util.global private @"__global" = dense<[1.000000e+01, 5.000000e+00, 1.000000e+01, 5.000000e+00]> : tensor<4xbf16>
-func.func @simple_bf16() -> (tensor<4xbf16>) {
+util.func public @simple_bf16() -> (tensor<4xbf16>) {
   %0 = util.global.address @"__global" : !util.ptr<tensor<4xbf16>>
   %1 = util.global.load.indirect %0 : !util.ptr<tensor<4xbf16>> -> tensor<4xbf16>
-  return %1 : tensor<4xbf16>
+  util.return %1 : tensor<4xbf16>
 }
 
 // -----
 
 // CHECK: util.global
 // CHECK-NOT: bf16
-// CHECK-LABEL: func.func @nested_region_bf16()
+// CHECK-LABEL: util.func public @nested_region_bf16()
 // CHECK-NOT: bf16
-// CHECK: return %{{.*}} : tensor<?xf32>
+// CHECK: util.return %{{.*}} : tensor<?xf32>
 util.global private @"__global" = dense<[1.000000e+01, 5.000000e+00, 1.000000e+01, 5.000000e+00]> : tensor<4xbf16>
-func.func @nested_region_bf16() -> (tensor<?xbf16>) {
+util.func public @nested_region_bf16() -> (tensor<?xbf16>) {
   %0 = util.global.address @"__global" : !util.ptr<tensor<4xbf16>>
   %1 = util.global.load.indirect %0 : !util.ptr<tensor<4xbf16>> -> tensor<4xbf16>
   %c4 = arith.constant 4 : index
@@ -29,5 +29,5 @@
     %element = tensor.extract %1[%arg0] : tensor<4xbf16>
     tensor.yield %element : bf16
   } : tensor<?xbf16>
-  return %2 : tensor<?xbf16>
+  util.return %2 : tensor<?xbf16>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/promote_f16_to_f32.mlir b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/promote_f16_to_f32.mlir
index e7ddbb5..40232c3 100644
--- a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/promote_f16_to_f32.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/promote_f16_to_f32.mlir

@@ -3,26 +3,26 @@
 // NOTE: for more comprehensive tests see demote_i64_to_i32.mlir.
 
 //       CHECK: util.global {{.*}} : tensor<4xf32>
-// CHECK-LABEL: func.func @simple_f16() -> tensor<4xf32>
+// CHECK-LABEL: util.func public @simple_f16() -> tensor<4xf32>
 //  CHECK-NEXT: %{{.*}} = util.global.address @__global : !util.ptr<tensor<4xf32>>
 //  CHECK-NEXT: %{{.*}} = util.global.load.indirect %{{.*}} : !util.ptr<tensor<4xf32>> -> tensor<4xf32>
-//  CHECK-NEXT: return %{{.*}} : tensor<4xf32>
+//  CHECK-NEXT: util.return %{{.*}} : tensor<4xf32>
 util.global private @"__global" = dense<[1.000000e+01, 5.000000e+00, 1.000000e+01, 5.000000e+00]> : tensor<4xf16>
-func.func @simple_f16() -> (tensor<4xf16>) {
+util.func public @simple_f16() -> (tensor<4xf16>) {
   %0 = util.global.address @"__global" : !util.ptr<tensor<4xf16>>
   %1 = util.global.load.indirect %0 : !util.ptr<tensor<4xf16>> -> tensor<4xf16>
-  return %1 : tensor<4xf16>
+  util.return %1 : tensor<4xf16>
 }
 
 // -----
 
 // CHECK: util.global
 // CHECK-NOT: f16
-// CHECK-LABEL: func.func @nested_region_f16()
+// CHECK-LABEL: util.func public @nested_region_f16()
 // CHECK-NOT: f16
-// CHECK: return %{{.*}} : tensor<?xf32>
+// CHECK: util.return %{{.*}} : tensor<?xf32>
 util.global private @"__global" = dense<[1.000000e+01, 5.000000e+00, 1.000000e+01, 5.000000e+00]> : tensor<4xf16>
-func.func @nested_region_f16() -> (tensor<?xf16>) {
+util.func public @nested_region_f16() -> (tensor<?xf16>) {
   %0 = util.global.address @"__global" : !util.ptr<tensor<4xf16>>
   %1 = util.global.load.indirect %0 : !util.ptr<tensor<4xf16>> -> tensor<4xf16>
   %c4 = arith.constant 4 : index
@@ -31,5 +31,5 @@
     %element = tensor.extract %1[%arg0] : tensor<4xf16>
     tensor.yield %element : f16
   } : tensor<?xf16>
-  return %2 : tensor<?xf16>
+  util.return %2 : tensor<?xf16>
 }

diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/propagate_subranges.mlir b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/propagate_subranges.mlir
index 81cbff3..c188fed 100644
--- a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/propagate_subranges.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/propagate_subranges.mlir

@@ -11,7 +11,7 @@
 util.global private mutable @constantGlobal : !util.buffer
 
 // CHECK-LABEL: @globalLoad
-func.func private @globalLoad() {
+util.func private @globalLoad() {
   // CHECK-NEXT: %[[RESOURCE:.+]] = util.global.load @constantGlobal : !util.buffer
   // CHECK-NEXT: %[[STORAGE_SIZE:.+]] = util.global.load @constantGlobal__storage_size : index
   // CHECK-NEXT: %[[OFFSET:.+]] = util.global.load @constantGlobal__offset : index
@@ -20,7 +20,7 @@
   %0 = util.global.load @constantGlobal : !util.buffer
   // CHECK-NEXT: util.optimization_barrier %[[SUBRANGE]]
   util.optimization_barrier %0 : !util.buffer
-  return
+  util.return
 }
 
 // -----
@@ -37,13 +37,13 @@
 
 // CHECK-LABEL: @globalStore
 // CHECK-SAME: (%[[RESOURCE:.+]]: !util.buffer, %[[STORAGE_SIZE:.+]]: index, %[[OFFSET:.+]]: index, %[[LENGTH:.+]]: index)
-func.func private @globalStore(%resource: !util.buffer) {
+util.func private @globalStore(%resource: !util.buffer) {
   // CHECK: util.global.store %[[RESOURCE]], @mutableGlobal : !util.buffer
   // CHECK: util.global.store %[[STORAGE_SIZE]], @mutableGlobal__storage_size : index
   // CHECK: util.global.store %[[OFFSET]], @mutableGlobal__offset : index
   // CHECK: util.global.store %[[LENGTH]], @mutableGlobal__length : index
   util.global.store %resource, @mutableGlobal : !util.buffer
-  return
+  util.return
 }
 
 // -----
@@ -55,7 +55,7 @@
 
 // CHECK-LABEL: @funcArgs
 // CHECK-SAME: (%[[RESOURCE0:.+]]: !util.buffer, %[[STORAGE_SIZE0:.+]]: index, %[[OFFSET0:.+]]: index, %[[LENGTH0:.+]]: index, %[[RESOURCE1:.+]]: !util.buffer, %[[STORAGE_SIZE1:.+]]: index, %[[OFFSET1:.+]]: index, %[[LENGTH1:.+]]: index)
-func.func private @funcArgs(%resource0: !util.buffer, %resource1: !util.buffer) {
+util.func private @funcArgs(%resource0: !util.buffer, %resource1: !util.buffer) {
   // CHECK-NEXT: %[[SUBRANGE0:.+]] = util.buffer.subspan %[[RESOURCE0]][%[[OFFSET0]]] : !util.buffer{%[[STORAGE_SIZE0]]} -> !util.buffer{%[[LENGTH0]]}
   // CHECK-NEXT: %[[SUBRANGE1:.+]] = util.buffer.subspan %[[RESOURCE1]][%[[OFFSET1]]] : !util.buffer{%[[STORAGE_SIZE1]]} -> !util.buffer{%[[LENGTH1]]}
 
@@ -63,7 +63,8 @@
   util.optimization_barrier %resource0 : !util.buffer
   // CHECK-NEXT: util.optimization_barrier %[[SUBRANGE1]]
   util.optimization_barrier %resource1 : !util.buffer
-  return
+
+  util.return
 }
 
 // -----
@@ -76,13 +77,13 @@
 // CHECK-LABEL: @funcResults
 // CHECK-SAME: (%[[RESOURCE0:.+]]: !util.buffer, %[[STORAGE_SIZE0:.+]]: index, %[[OFFSET0:.+]]: index, %[[LENGTH0:.+]]: index, %[[RESOURCE1:.+]]: !util.buffer, %[[STORAGE_SIZE1:.+]]: index, %[[OFFSET1:.+]]: index, %[[LENGTH1:.+]]: index)
 // CHECK-SAME: -> (!util.buffer, index, index, index, !util.buffer, index, index, index)
-func.func private @funcResults(%resource0: !util.buffer, %resource1: !util.buffer) -> (!util.buffer, !util.buffer) {
+util.func private @funcResults(%resource0: !util.buffer, %resource1: !util.buffer) -> (!util.buffer, !util.buffer) {
   // NOTE: there will be extra stuff here from the arg insertion. Since the
   // return should consume the subrange that was inserted we expect to directly
   // use the function arguments.
 
-  // CHECK: return %[[RESOURCE0]], %[[STORAGE_SIZE0]], %[[OFFSET0]], %[[LENGTH0]], %[[RESOURCE1]], %[[STORAGE_SIZE1]], %[[OFFSET1]], %[[LENGTH1]]
-  return %resource0, %resource1 : !util.buffer, !util.buffer
+  // CHECK: util.return %[[RESOURCE0]], %[[STORAGE_SIZE0]], %[[OFFSET0]], %[[LENGTH0]], %[[RESOURCE1]], %[[STORAGE_SIZE1]], %[[OFFSET1]], %[[LENGTH1]]
+  util.return %resource0, %resource1 : !util.buffer, !util.buffer
 }
 
 
@@ -92,9 +93,9 @@
 
 // CHECK-LABEL: @publicFuncSignature
 // CHECK-SAME: (%[[RESOURCE:.+]]: !util.buffer) -> !util.buffer
-func.func @publicFuncSignature(%resource: !util.buffer) -> !util.buffer {
-  // CHECK-NEXT: return %[[RESOURCE]] : !util.buffer
-  return %resource : !util.buffer
+util.func @publicFuncSignature(%resource: !util.buffer) -> !util.buffer {
+  // CHECK-NEXT: util.return %[[RESOURCE]] : !util.buffer
+  util.return %resource : !util.buffer
 }
 
 // -----
@@ -107,15 +108,15 @@
 
 // CHECK-LABEL: @caller
 // CHECK-SAME: (%[[RESOURCE0:.+]]: !util.buffer, %[[STORAGE_SIZE0:.+]]: index, %[[OFFSET0:.+]]: index, %[[LENGTH0:.+]]: index, %[[RESOURCE1:.+]]: !util.buffer, %[[STORAGE_SIZE1:.+]]: index, %[[OFFSET1:.+]]: index, %[[LENGTH1:.+]]: index)
-func.func private @caller(%resource0: !util.buffer, %resource1: !util.buffer) {
+util.func private @caller(%resource0: !util.buffer, %resource1: !util.buffer) {
   // NOTE: there will be extra stuff here from the arg insertion. The call
   // consumes the subranges and we expect the args to be passed directly.
 
-  // CHECK: %[[RET:.+]]:8 = call @callee(%[[RESOURCE0]], %[[STORAGE_SIZE0]], %[[OFFSET0]], %[[LENGTH0]],
+  // CHECK: %[[RET:.+]]:8 = util.call @callee(%[[RESOURCE0]], %[[STORAGE_SIZE0]], %[[OFFSET0]], %[[LENGTH0]],
   // CHECK-SAME:                         %[[RESOURCE1]], %[[STORAGE_SIZE1]], %[[OFFSET1]], %[[LENGTH1]])
   // CHECK-SAME: : (!util.buffer, index, index, index, !util.buffer, index, index, index)
   // CHECK-SAME: -> (!util.buffer, index, index, index, !util.buffer, index, index, index)
-  %0:2 = call @callee(%resource0, %resource1) : (!util.buffer, !util.buffer) -> (!util.buffer, !util.buffer)
+  %0:2 = util.call @callee(%resource0, %resource1) : (!util.buffer, !util.buffer) -> (!util.buffer, !util.buffer)
   // CHECK-NEXT: %[[RET_SUBRANGE0:.+]] = util.buffer.subspan %[[RET]]#0[%[[RET]]#2] : !util.buffer{%[[RET]]#1} -> !util.buffer{%[[RET]]#3}
   // CHECK-NEXT: %[[RET_SUBRANGE1:.+]] = util.buffer.subspan %[[RET]]#4[%[[RET]]#6] : !util.buffer{%[[RET]]#5} -> !util.buffer{%[[RET]]#7}
 
@@ -124,11 +125,11 @@
   // CHECK-NEXT: util.optimization_barrier %[[RET_SUBRANGE1]] : !util.buffer
   util.optimization_barrier %0#1 : !util.buffer
 
-  return
+  util.return
 }
 
-func.func private @callee(%arg0: !util.buffer, %arg1: !util.buffer) -> (!util.buffer, !util.buffer) {
-  return %arg0, %arg1 : !util.buffer, !util.buffer
+util.func private @callee(%arg0: !util.buffer, %arg1: !util.buffer) -> (!util.buffer, !util.buffer) {
+  util.return %arg0, %arg1 : !util.buffer, !util.buffer
 }
 
 // -----
@@ -139,21 +140,21 @@
 
 // CHECK-LABEL: @callerInSCF
 // CHECK-SAME: (%[[RESOURCE:.+]]: !util.buffer, %[[STORAGE_SIZE:.+]]: index, %[[OFFSET:.+]]: index, %[[LENGTH:.+]]: index, %[[COND:.+]]: i1)
-func.func private @callerInSCF(%resource: !util.buffer, %cond: i1) {
+util.func private @callerInSCF(%resource: !util.buffer, %cond: i1) {
   // NOTE: there will be extra stuff here from the arg insertion. The call
   // consumes the subranges and we expect the args to be passed directly.
 
   // CHECK: scf.if %[[COND]]
   scf.if %cond {
-    // CHECK: func.call @callee(%[[RESOURCE]], %[[STORAGE_SIZE]], %[[OFFSET]], %[[LENGTH]])
-    func.call @callee(%resource) : (!util.buffer) -> ()
+    // CHECK: util.call @callee(%[[RESOURCE]], %[[STORAGE_SIZE]], %[[OFFSET]], %[[LENGTH]])
+    util.call @callee(%resource) : (!util.buffer) -> ()
   }
 
-  return
+  util.return
 }
 
-func.func private @callee(%arg0: !util.buffer) {
-  return
+util.func private @callee(%arg0: !util.buffer) {
+  util.return
 }
 
 // -----
@@ -165,7 +166,7 @@
 
 // CHECK-LABEL: @callerWithSubrange
 // CHECK-SAME: (%[[ARG_RESOURCE:.+]]: !util.buffer, %[[ARG_SIZE:.+]]: index, %[[ARG_OFFSET:.+]]: index, %[[ARG_LENGTH:.+]]: index)
-func.func private @callerWithSubrange(%arg: !util.buffer) {
+util.func private @callerWithSubrange(%arg: !util.buffer) {
   // NOTE: there will be extra stuff here from the arg insertion. The call
   // consumes the subranges and we expect the args to be passed directly.
 
@@ -177,8 +178,8 @@
   // CHECK-DAG: %[[ARG_ADJUSTED_OFFSET:.+]] = arith.addi %[[ARG_OFFSET]], %[[ARG_LOCAL_OFFSET]]
   %arg_subspan = util.buffer.subspan %arg[%arg_offset] : !util.buffer{%arg_size} -> !util.buffer{%arg_length}
 
-  // CHECK: %[[RET0:.+]]:4 = call @callee(%[[ARG_RESOURCE]], %[[ARG_SIZE]], %[[ARG_ADJUSTED_OFFSET]], %[[ARG_LOCAL_LENGTH]])
-  %ret0 = call @callee(%arg_subspan) : (!util.buffer) -> (!util.buffer)
+  // CHECK: %[[RET0:.+]]:4 = util.call @callee(%[[ARG_RESOURCE]], %[[ARG_SIZE]], %[[ARG_ADJUSTED_OFFSET]], %[[ARG_LOCAL_LENGTH]])
+  %ret0 = util.call @callee(%arg_subspan) : (!util.buffer) -> (!util.buffer)
 
   %ret0_size = util.buffer.size %ret0 : !util.buffer
   // CHECK-DAG: %[[RET0_LOCAL_OFFSET:.+]] = arith.constant 300
@@ -188,18 +189,18 @@
   // CHECK-DAG: %[[RET0_ADJUSTED_OFFSET:.+]] = arith.addi %[[RET0]]#2, %[[RET0_LOCAL_OFFSET]]
   %ret0_subspan = util.buffer.subspan %ret0[%ret0_offset] : !util.buffer{%ret0_size} -> !util.buffer{%ret0_length}
 
-  // CHECK: %[[RET1:.+]]:4 = call @callee(%[[RET0]]#0, %[[RET0]]#1, %[[RET0_ADJUSTED_OFFSET]], %[[RET0_LOCAL_LENGTH]])
-  %ret1 = call @callee(%ret0_subspan) : (!util.buffer) -> (!util.buffer)
+  // CHECK: %[[RET1:.+]]:4 = util.call @callee(%[[RET0]]#0, %[[RET0]]#1, %[[RET0_ADJUSTED_OFFSET]], %[[RET0_LOCAL_LENGTH]])
+  %ret1 = util.call @callee(%ret0_subspan) : (!util.buffer) -> (!util.buffer)
   // CHECK: %[[RET1_SUBRANGE:.+]] = util.buffer.subspan %[[RET1]]#0[%[[RET1]]#2] : !util.buffer{%[[RET1]]#1} -> !util.buffer{%[[RET1]]#3}
 
   // CHECK-NEXT: util.optimization_barrier %[[RET1_SUBRANGE]] : !util.buffer
   util.optimization_barrier %ret1 : !util.buffer
 
-  return
+  util.return
 }
 
-func.func private @callee(%arg0: !util.buffer) -> !util.buffer {
-  return %arg0 : !util.buffer
+util.func private @callee(%arg0: !util.buffer) -> !util.buffer {
+  util.return %arg0 : !util.buffer
 }
 
 // -----
@@ -211,7 +212,7 @@
 
 // CHECK-LABEL: @br
 // CHECK-SAME: (%[[RESOURCE0:.+]]: !util.buffer, %[[STORAGE_SIZE0:.+]]: index, %[[OFFSET0:.+]]: index, %[[LENGTH0:.+]]: index, %[[RESOURCE1:.+]]: !util.buffer, %[[STORAGE_SIZE1:.+]]: index, %[[OFFSET1:.+]]: index, %[[LENGTH1:.+]]: index)
-func.func private @br(%resource0: !util.buffer, %resource1: !util.buffer) {
+util.func private @br(%resource0: !util.buffer, %resource1: !util.buffer) {
   // NOTE: there will be extra stuff here from the arg insertion. The branch
   // consumes the unready resources and we expect the args to be passed directly
   // to the cf.br.
@@ -230,5 +231,5 @@
   // CHECK-NEXT: util.optimization_barrier %[[BB1_SUBRANGE1]]
   util.optimization_barrier %bb1_resource1 : !util.buffer
 
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/simplify_global_accesses.mlir b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/simplify_global_accesses.mlir
index 9e2077a..49abd98 100644
--- a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/simplify_global_accesses.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/simplify_global_accesses.mlir

@@ -1,10 +1,10 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(func.func(iree-util-simplify-global-accesses))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(util.func(iree-util-simplify-global-accesses))' %s | FileCheck %s
 
 util.global private @varA = dense<1> : tensor<2xi32>
 util.global private @varB = dense<3> : tensor<2x4xi32>
 
 // CHECK-LABEL: @constants()
-func.func @constants() {
+util.func public @constants() {
   // CHECK-DAG: constant 10
   %w = arith.constant 10 : index
   // CHECK-DAG: %[[VAR_A:.+]] = util.global.load @varA : tensor<2xi32>
@@ -15,7 +15,7 @@
   %varB = util.global.load @varB : tensor<2x4xi32>
   // CHECK-NEXT: flow.dispatch @ex::@dispatch1{{.+}}(%[[T]], %[[VAR_B]])
   %d1 = flow.dispatch @ex::@dispatch1[%w](%d0, %varB) : (tensor<2xi32>, tensor<2x4xi32>) -> tensor<2xi32>
-  return
+  util.return
 }
 
 // -----
@@ -24,7 +24,7 @@
 util.global private @varB = 2 : i32
 
 // CHECK-LABEL: @constants_in_cfg
-func.func @constants_in_cfg(%start: i32, %bound: i32) -> i32 {
+util.func public @constants_in_cfg(%start: i32, %bound: i32) -> i32 {
   // CHECK-NEXT: %[[VAR_A:.+]] = util.global.load @varA : i32
   // CHECK-NEXT: %[[VAR_B:.+]] = util.global.load @varB : i32
   // CHECK-NEXT: cf.br ^bb1
@@ -47,8 +47,8 @@
   %11 = util.global.load @varB : i32
   // CHECK-NEXT: %[[T1:.+]] = arith.subi %[[T0]], %[[VAR_B]]
   %12 = arith.subi %10, %11 : i32
-  // CHECK-NEXT: return %[[T1]]
-  return %12 : i32
+  // CHECK-NEXT: util.return %[[T1]]
+  util.return %12 : i32
 }
 
 // -----
@@ -57,7 +57,7 @@
 util.global private @varB = dense<3> : tensor<2x4xi32>
 
 // CHECK-LABEL: @mixed_mutability
-func.func @mixed_mutability() {
+util.func public @mixed_mutability() {
   // CHECK-DAG: %[[VAR_A:.+]] = util.global.load @varA : tensor<2xi32>
   // CHECK-DAG: %[[VAR_B:.+]] = util.global.load @varB : tensor<2x4xi32>
   // CHECK-NEXT: constant 10
@@ -70,7 +70,7 @@
   %d1 = flow.dispatch @ex::@dispatch1[%w](%d0, %varB) : (tensor<2xi32>, tensor<2x4xi32>) -> tensor<2xi32>
   // CHECK-NEXT: util.global.store %[[T1]], @varA : tensor<2xi32>
   util.global.store %d1, @varA : tensor<2xi32>
-  return
+  util.return
 }
 
 // -----
@@ -78,14 +78,14 @@
 util.global private mutable @varA = dense<1> : tensor<2xi32>
 
 // CHECK-LABEL: @raw
-func.func @raw() {
+util.func public @raw() {
   // CHECK: %[[T:.+]] = util.global.load @varA {id = 0
   %varA_0 = util.global.load @varA {id = 0} : tensor<2xi32>
   util.global.store %varA_0, @varA {id = 0} : tensor<2xi32>
   %varA_1 = util.global.load @varA {id = 1} : tensor<2xi32>
   // CHECK-NEXT: util.global.store %[[T]], @varA {id = 1
   util.global.store %varA_1, @varA {id = 1} : tensor<2xi32>
-  return
+  util.return
 }
 
 // -----
@@ -93,12 +93,12 @@
 util.global private mutable @varA = dense<1> : tensor<2xi32>
 
 // CHECK-LABEL: @rar
-func.func @rar() -> (tensor<2xi32>, tensor<2xi32>) {
+util.func public @rar() -> (tensor<2xi32>, tensor<2xi32>) {
   // CHECK: %[[T:.+]] = util.global.load @varA {id = 0
   %varA_0 = util.global.load @varA {id = 0} : tensor<2xi32>
   %varA_1 = util.global.load @varA {id = 1} : tensor<2xi32>
-  // CHECK-NEXT: return %[[T]], %[[T]]
-  return %varA_0, %varA_1 : tensor<2xi32>, tensor<2xi32>
+  // CHECK-NEXT: util.return %[[T]], %[[T]]
+  util.return %varA_0, %varA_1 : tensor<2xi32>, tensor<2xi32>
 }
 
 // -----
@@ -107,11 +107,11 @@
 
 // CHECK-LABEL: @waw
 // CHECK-SAME: (%[[ARG0:.+]]: tensor<2xi32>, %[[ARG1:.+]]: tensor<2xi32>)
-func.func @waw(%varA_0: tensor<2xi32>, %varA_1: tensor<2xi32>) {
+util.func public @waw(%varA_0: tensor<2xi32>, %varA_1: tensor<2xi32>) {
   util.global.store %varA_0, @varA : tensor<2xi32>
   // CHECK-NEXT: util.global.store %[[ARG1]], @varA
   util.global.store %varA_1, @varA : tensor<2xi32>
-  return
+  util.return
 }
 
 // -----
@@ -119,21 +119,21 @@
 util.global private mutable @varA = dense<1> : tensor<2xi32>
 
 // CHECK-LABEL: @side_effects(
-func.func @side_effects() {
+util.func public @side_effects() {
   // CHECK-NEXT: %[[T0:.+]] = util.global.load @varA
   %varA_0 = util.global.load @varA : tensor<2xi32>
   // CHECK-NEXT: util.global.store %[[T0]], @varA
   util.global.store %varA_0, @varA : tensor<2xi32>
-  // CHECK-NEXT: call @other_fn()
-  call @other_fn() : () -> ()
+  // CHECK-NEXT: util.call @other_fn()
+  util.call @other_fn() : () -> ()
   // CHECK-NEXT: %[[T1:.+]] = util.global.load @varA
   %varA_1 = util.global.load @varA : tensor<2xi32>
   // CHECK-NEXT: util.global.store %[[T1]], @varA
   util.global.store %varA_1, @varA : tensor<2xi32>
-  return
+  util.return
 }
 
-func.func private @other_fn()
+util.func private @other_fn()
 
 // -----
 
@@ -141,7 +141,7 @@
 util.global private mutable @varB = dense<2> : tensor<2xi32>
 
 // CHECK-LABEL: @ordering
-func.func @ordering() {
+util.func public @ordering() {
   %cst_top = arith.constant 1 : index
   %varA_0 = util.global.load @varA {id = 0} : tensor<2xi32>
   util.global.store %varA_0, @varA {id = 0} : tensor<2xi32>
@@ -160,5 +160,5 @@
   // CHECK-NEXT: arith.constant
   // CHECK-DAG: util.global.store %[[T0]], @varA {id = 0
   // CHECK-DAG: util.global.store %[[T1]], @varB {id = 1
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/strip_debug_ops.mlir b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/strip_debug_ops.mlir
index fcf7d37..52894e3 100644
--- a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/strip_debug_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/strip_debug_ops.mlir

@@ -1,8 +1,8 @@
-// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(func.func(iree-util-strip-debug-ops))' %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(util.func(iree-util-strip-debug-ops))' %s | FileCheck %s
 
 // CHECK-LABEL: @stripAssert
-func.func @stripAssert(%cond: i1) {
+util.func @stripAssert(%cond: i1) {
   // CHECK-NOT: cf.assert
   cf.assert %cond, "hello!"
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/test_float_range_analysis.mlir b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/test_float_range_analysis.mlir
index d5aa9f5..238567c 100644
--- a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/test_float_range_analysis.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/test_float_range_analysis.mlir

@@ -1,67 +1,67 @@
 // RUN: iree-opt --split-input-file --iree-util-test-float-range-analysis --allow-unregistered-dialect %s | FileCheck %s
 
 // CHECK-LABEL: @scalar_const_trunc
-func.func @scalar_const_trunc() -> f32 {
+util.func @scalar_const_trunc() -> f32 {
   %0 = arith.constant 5.0 : f32
   // CHECK: fp-range: [5.000000, 5.000000, TRUNC]
   %1 = "iree_unregistered.test_fprange"(%0) : (f32) -> f32
-  return %1 : f32
+  util.return %1 : f32
 }
 
 // -----
 // CHECK-LABEL: @scalar_const_non_trunc
-func.func @scalar_const_non_trunc() -> f32 {
+util.func @scalar_const_non_trunc() -> f32 {
   %0 = arith.constant 5.2 : f32
   // CHECK: fp-range: [5.200000, 5.200000, !trunc]
   %1 = "iree_unregistered.test_fprange"(%0) : (f32) -> f32
-  return %1 : f32
+  util.return %1 : f32
 }
 
 // -----
 // CHECK-LABEL: @scalar_non_float
-func.func @scalar_non_float() -> i32 {
+util.func @scalar_non_float() -> i32 {
   %0 = arith.constant 5 : i32
   // NOTE: The least-constrained value is returned for a non-fp type. It
   // is up to the user to ensure that we are requesting stats for fp types
   // and this represents the soft-failure mode if asking about an illegal type.
   // CHECK: fp-range: [-inf, inf, !trunc]
   %1 = "iree_unregistered.test_fprange"(%0) : (i32) -> i32
-  return %1 : i32
+  util.return %1 : i32
 }
 
 // -----
 // CHECK-LABEL: @tensor_const_trunc
-func.func @tensor_const_trunc() -> tensor<2xf32> {
+util.func @tensor_const_trunc() -> tensor<2xf32> {
   %0 = arith.constant dense<[-2.0, 2.0]> : tensor<2xf32>
   // CHECK: fp-range: [-2.000000, 2.000000, TRUNC]
   %1 = "iree_unregistered.test_fprange"(%0) : (tensor<2xf32>) -> tensor<2xf32>
-  return %1 : tensor<2xf32>
+  util.return %1 : tensor<2xf32>
 }
 
 // -----
 // CHECK-LABEL: @tensor_const_non_trunc
-func.func @tensor_const_non_trunc() -> tensor<2xf32> {
+util.func @tensor_const_non_trunc() -> tensor<2xf32> {
   %0 = arith.constant dense<[-1.2, 2.0]> : tensor<2xf32>
   // CHECK: fp-range: [-1.200000, 2.000000, !trunc]
   %1 = "iree_unregistered.test_fprange"(%0) : (tensor<2xf32>) -> tensor<2xf32>
-  return %1 : tensor<2xf32>
+  util.return %1 : tensor<2xf32>
 }
 
 // -----
 // CHECK-LABEL: @min_max_no_trunc
-func.func @min_max_no_trunc(%arg0 : f32) -> f32 {
+util.func @min_max_no_trunc(%arg0 : f32) -> f32 {
   %0 = arith.constant -5.0 : f32
   %1 = arith.constant 5.0 : f32
   %2 = arith.minimumf %arg0, %1 : f32
   %3 = arith.maximumf %2, %0 : f32
   // CHECK: fp-range: [-5.000000, 5.000000, !trunc]
   %result = "iree_unregistered.test_fprange"(%3) : (f32) -> f32
-  return %result : f32
+  util.return %result : f32
 }
 
 // -----
 // CHECK-LABEL: @min_max_floor
-func.func @min_max_floor(%arg0 : f32) -> f32 {
+util.func @min_max_floor(%arg0 : f32) -> f32 {
   %0 = arith.constant -5.0 : f32
   %1 = arith.constant 5.0 : f32
   %2 = arith.minimumf %arg0, %1 : f32
@@ -69,12 +69,12 @@
   %4 = math.floor %3 : f32
   // CHECK: fp-range: [-5.000000, 5.000000, TRUNC]
   %result = "iree_unregistered.test_fprange"(%4) : (f32) -> f32
-  return %result : f32
+  util.return %result : f32
 }
 
 // -----
 // CHECK-LABEL: @min_max_floor_adj_range
-func.func @min_max_floor_adj_range(%arg0 : f32) -> f32 {
+util.func @min_max_floor_adj_range(%arg0 : f32) -> f32 {
   %0 = arith.constant -5.2 : f32
   %1 = arith.constant 5.2 : f32
   %2 = arith.minimumf %arg0, %1 : f32
@@ -82,12 +82,12 @@
   %4 = math.floor %3 : f32
   // CHECK: fp-range: [-6.000000, 5.000000, TRUNC]
   %result = "iree_unregistered.test_fprange"(%4) : (f32) -> f32
-  return %result : f32
+  util.return %result : f32
 }
 
 // -----
 // CHECK-LABEL: @floor_min_max
-func.func @floor_min_max(%arg0 : f32) -> f32 {
+util.func @floor_min_max(%arg0 : f32) -> f32 {
   %0 = arith.constant -5.0 : f32
   %1 = arith.constant 5.0 : f32
   %2 = math.floor %arg0 : f32
@@ -95,5 +95,5 @@
   %4 = arith.minimumf %3, %1 : f32
   // CHECK: fp-range: [-5.000000, 5.000000, TRUNC]
   %result = "iree_unregistered.test_fprange"(%4) : (f32) -> f32
-  return %result : f32
+  util.return %result : f32
 }

diff --git a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/test_float_range_analysis_linalg.mlir b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/test_float_range_analysis_linalg.mlir
index e1a4003..a6e88e8 100644
--- a/compiler/src/iree/compiler/Dialect/Util/Transforms/test/test_float_range_analysis_linalg.mlir
+++ b/compiler/src/iree/compiler/Dialect/Util/Transforms/test/test_float_range_analysis_linalg.mlir

@@ -2,7 +2,7 @@
 
 #map0 = affine_map<(d0, d1) -> ()>
 #map1 = affine_map<(d0, d1) -> (d0, d1)>
-func.func @linalg_generic_traversal(%arg0 : tensor<5x6xf32>) -> (tensor<5x6xf32>, tensor<5x6xf32>, tensor<5x6xf32>) {
+util.func @linalg_generic_traversal(%arg0 : tensor<5x6xf32>) -> (tensor<5x6xf32>, tensor<5x6xf32>, tensor<5x6xf32>) {
   %cst_min = arith.constant dense<-1.270000e+02> : tensor<f32>
   %cst_max = arith.constant dense<1.270000e+02> : tensor<f32>
   %init = tensor.empty() : tensor<5x6xf32>
@@ -38,5 +38,5 @@
   %max_range = "iree_unregistered.test_fprange"(%max) : (tensor<5x6xf32>) -> tensor<5x6xf32>
   // CHECK: fp-range: [-inf, inf, TRUNC]
   %floor_range = "iree_unregistered.test_fprange"(%floor) : (tensor<5x6xf32>) -> tensor<5x6xf32>
-  return %result_range, %max_range, %floor_range : tensor<5x6xf32>, tensor<5x6xf32>, tensor<5x6xf32>
+  util.return %result_range, %max_range, %floor_range : tensor<5x6xf32>, tensor<5x6xf32>, tensor<5x6xf32>
 }

diff --git a/compiler/src/iree/compiler/Dialect/VM/Conversion/StandardToVM/ConvertStandardToVM.cpp b/compiler/src/iree/compiler/Dialect/VM/Conversion/StandardToVM/ConvertStandardToVM.cpp
index 8345e92..9321d82 100644
--- a/compiler/src/iree/compiler/Dialect/VM/Conversion/StandardToVM/ConvertStandardToVM.cpp
+++ b/compiler/src/iree/compiler/Dialect/VM/Conversion/StandardToVM/ConvertStandardToVM.cpp

@@ -173,6 +173,7 @@
   constexpr const char *kRetainedAttributes[] = {
       "nosideeffects",
       "vm.fallback",
+      "vm.signature",
   };
   auto retainedAttributes = ArrayRef<const char *>(
       kRetainedAttributes,
@@ -283,7 +284,7 @@
     // (Slow) lookup of the target function, which may be an import that we need
     // to perform type conversion for.
     auto calleeOp = SymbolTable::lookupSymbolIn(rootOp, calleeName);
-    if (auto funcOp = dyn_cast_or_null<func::FuncOp>(calleeOp)) {
+    if (auto funcOp = dyn_cast_or_null<FunctionOpInterface>(calleeOp)) {
       if (funcOp.isExternal()) {
         // Import that may require conversion.
         // This case handles when funcs are declared after the call.

diff --git a/compiler/src/iree/compiler/Dialect/VM/Conversion/UtilToVM/ConvertStructuralOps.cpp b/compiler/src/iree/compiler/Dialect/VM/Conversion/UtilToVM/ConvertStructuralOps.cpp
index 5c0614e..87f0b56 100644
--- a/compiler/src/iree/compiler/Dialect/VM/Conversion/UtilToVM/ConvertStructuralOps.cpp
+++ b/compiler/src/iree/compiler/Dialect/VM/Conversion/UtilToVM/ConvertStructuralOps.cpp

@@ -146,6 +146,7 @@
   constexpr const char *kRetainedAttributes[] = {
       "nosideeffects",
       "vm.fallback",
+      "vm.signature",
   };
   auto retainedAttributes = ArrayRef<const char *>(
       kRetainedAttributes,

diff --git a/compiler/src/iree/compiler/GlobalOptimization/BUILD.bazel b/compiler/src/iree/compiler/GlobalOptimization/BUILD.bazel
index efcb00b..2eebddd 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/BUILD.bazel
+++ b/compiler/src/iree/compiler/GlobalOptimization/BUILD.bazel

@@ -99,7 +99,6 @@
         "@llvm-project//mlir:ArithUtils",
         "@llvm-project//mlir:ControlFlowDialect",
         "@llvm-project//mlir:DialectUtils",
-        "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:FunctionInterfaces",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:LinalgDialect",

diff --git a/compiler/src/iree/compiler/GlobalOptimization/CMakeLists.txt b/compiler/src/iree/compiler/GlobalOptimization/CMakeLists.txt
index 11c4af8..63e91a3 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/CMakeLists.txt
+++ b/compiler/src/iree/compiler/GlobalOptimization/CMakeLists.txt

@@ -73,7 +73,6 @@
     MLIRArithDialect
     MLIRArithUtils
     MLIRControlFlowDialect
-    MLIRFuncDialect
     MLIRFunctionInterfaces
     MLIRIR
     MLIRLinalgDialect

diff --git a/compiler/src/iree/compiler/GlobalOptimization/ExpandTensorShapes.cpp b/compiler/src/iree/compiler/GlobalOptimization/ExpandTensorShapes.cpp
index 1549a37..d652ace 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/ExpandTensorShapes.cpp
+++ b/compiler/src/iree/compiler/GlobalOptimization/ExpandTensorShapes.cpp

@@ -17,7 +17,6 @@
 #include "llvm/ADT/BreadthFirstIterator.h"
 #include "llvm/Support/Debug.h"
 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinTypes.h"
@@ -59,7 +58,8 @@
 // dynamic dims as globals as duplicates will get added and we'll need to rely
 // on global fusion to get rid of them. Note that this only expands globals and
 // does not yet update use sites - we just need the ops to reference.
-static ExpandedGlobalMap expandGlobalTensorDims(Operation *rootOp) {
+static ExpandedGlobalMap expandGlobalTensorDims(Operation *rootOp,
+                                                SymbolTable &symbolTable) {
   ExpandedGlobalMap expandedGlobals;
 
   // Gather all of the dynamically-shaped tensor globals in the root.
@@ -72,7 +72,6 @@
   }
 
   // Expand each global by adding one global per dynamic dim beside it.
-  SymbolTable symbolTable(rootOp);
   auto indexType = IndexType::get(rootOp->getContext());
   for (auto &it : expandedGlobals) {
     auto &global = it.second;
@@ -108,19 +107,23 @@
          llvm::any_of(op->getResultTypes(), isDynamicTensor);
 }
 
+static void expandType(Type type, SmallVectorImpl<Type> &newTypes) {
+  newTypes.push_back(type);
+  if (auto tensorType = llvm::dyn_cast<RankedTensorType>(type)) {
+    newTypes.append(tensorType.getNumDynamicDims(),
+                    IndexType::get(type.getContext()));
+  }
+}
+
 // Expands tensors in the given |types| list to (tensor, dynamic dims...).
 // This could be changed to some iterator magic to avoid the alloc.
 static SmallVector<Type> expandTypes(TypeRange types) {
   if (types.empty())
     return {};
-  auto indexType = IndexType::get(types.front().getContext());
   SmallVector<Type> newTypes;
   newTypes.reserve(types.size() * 2);
   for (auto type : types) {
-    newTypes.push_back(type);
-    if (auto tensorType = llvm::dyn_cast<RankedTensorType>(type)) {
-      newTypes.append(tensorType.getNumDynamicDims(), indexType);
-    }
+    expandType(type, newTypes);
   }
   return newTypes;
 }
@@ -163,6 +166,20 @@
   return expandedValue;
 }
 
+static void expandOperand(Location loc, Value operand,
+                          SmallVectorImpl<Value> &newOperands,
+                          TensorDimMap &tensorDimMap, IndexSet &indexSet,
+                          OpBuilder &builder) {
+  if (isDynamicTensor(operand.getType())) {
+    auto expandedValue =
+        consumeExpandedValue(loc, operand, tensorDimMap, indexSet, builder);
+    newOperands.push_back(expandedValue.tensor);
+    newOperands.append(expandedValue.dynamicDims);
+  } else {
+    newOperands.push_back(operand);
+  }
+}
+
 // Expands tensor in |operands| into (tensor, dynamic dims...) tuples.
 static SmallVector<Value> expandOperands(Location loc, ValueRange operands,
                                          TensorDimMap &tensorDimMap,
@@ -171,25 +188,20 @@
   SmallVector<Value> result;
   result.reserve(operands.size() * 2);
   for (auto operand : operands) {
-    if (isDynamicTensor(operand.getType())) {
-      auto expandedValue =
-          consumeExpandedValue(loc, operand, tensorDimMap, indexSet, builder);
-      result.push_back(expandedValue.tensor);
-      result.append(expandedValue.dynamicDims);
-    } else {
-      result.push_back(operand);
-    }
+    expandOperand(loc, operand, result, tensorDimMap, indexSet, builder);
   }
   return result;
 }
 
-static void expandTensorDims(Operation *op, ExpandedGlobalMap &globalMap,
-                             IndexSet &indexSet, TensorDimMap &tensorDimMap);
+static void expandTensorDims(Operation *op, SymbolTable &symbolTable,
+                             ExpandedGlobalMap &globalMap, IndexSet &indexSet,
+                             TensorDimMap &tensorDimMap);
 
 // Recursively expands tensors into (tensor, dynamic dims...) tuples within the
 // given |region|. All branches, ops, and nested regions will be processed.
-static void expandRegion(Region &region, ExpandedGlobalMap &globalMap,
-                         IndexSet &indexSet, TensorDimMap tensorDimMap) {
+static void expandRegion(Region &region, SymbolTable &symbolTable,
+                         ExpandedGlobalMap &globalMap, IndexSet &indexSet,
+                         TensorDimMap tensorDimMap) {
   if (region.empty())
     return;
 
@@ -232,14 +244,14 @@
   if (region.hasOneBlock()) {
     for (auto &op :
          llvm::make_early_inc_range(region.front().getOperations())) {
-      expandTensorDims(&op, globalMap, indexSet, tensorDimMap);
+      expandTensorDims(&op, symbolTable, globalMap, indexSet, tensorDimMap);
     }
   } else {
     DominanceInfo domInfo(region.getParentOp());
     for (auto *blockInfo : llvm::breadth_first(domInfo.getRootNode(&region))) {
       auto *block = blockInfo->getBlock();
       for (auto &op : llvm::make_early_inc_range(block->getOperations())) {
-        expandTensorDims(&op, globalMap, indexSet, tensorDimMap);
+        expandTensorDims(&op, symbolTable, globalMap, indexSet, tensorDimMap);
       }
     }
   }
@@ -337,30 +349,36 @@
 }
 
 static void expandInitializerOp(IREE::Util::InitializerOp op,
+                                SymbolTable &symbolTable,
                                 ExpandedGlobalMap &globalMap,
                                 IndexSet &indexSet,
                                 TensorDimMap &tensorDimMap) {
-  expandRegion(op.getRegion(), globalMap, indexSet, tensorDimMap);
+  expandRegion(op.getRegion(), symbolTable, globalMap, indexSet, tensorDimMap);
 }
 
 // Inserts dimension associate reshapes on tensor arguments.
 // Requires that the ExpandCallOp/ExpandReturnOp patterns handle passing dims.
 //
 // Example:
-//  func.func @foo(%0: tensor<?xf32>)
+//  util.func @foo(%0: tensor<?xf32>)
 //  ->
-//  func.func @foo(%0: tensor<?xf32>, %d: index) {
+//  util.func @foo(%0: tensor<?xf32>, %d: index) {
 //    %1 = flow.tensor.tie_shape %0 : tensor<?xf32>{%d}
-static void expandFuncOp(mlir::func::FuncOp op, ExpandedGlobalMap &globalMap,
-                         IndexSet &indexSet, TensorDimMap &tensorDimMap) {
-  auto oldType = op.getFunctionType();
-  auto inputTypes = expandTypes(oldType.getInputs());
-  auto resultTypes = expandTypes(oldType.getResults());
-  auto newType = FunctionType::get(op.getContext(), inputTypes, resultTypes);
-  if (newType != oldType) {
-    op.setType(newType);
+static void expandFuncOp(IREE::Util::FuncOp op, SymbolTable &symbolTable,
+                         ExpandedGlobalMap &globalMap, IndexSet &indexSet,
+                         TensorDimMap &tensorDimMap) {
+  // Ignore public/external function signatures but still convert regions.
+  bool canModifyEntryBlock = !IREE::Util::isPublicOrExternal(op);
+  if (canModifyEntryBlock) {
+    op.expandSignature(
+        [&](unsigned i, Type type, SmallVectorImpl<Type> &newTypes) {
+          expandType(type, newTypes);
+        },
+        [&](unsigned i, Type type, SmallVectorImpl<Type> &newTypes) {
+          expandType(type, newTypes);
+        });
   }
-  expandRegion(op.getRegion(), globalMap, indexSet, tensorDimMap);
+  expandRegion(op.getRegion(), symbolTable, globalMap, indexSet, tensorDimMap);
 }
 
 // Splits tensor operands and results into (tensor, dynamic dims...).
@@ -368,22 +386,31 @@
 //
 // Example:
 //  %a = flow.tensor.tie_shape %0 : tensor<?xf32>{%d}
-//  %r = call @foo(%a)
+//  %r = util.call @foo(%a)
 //  ->
-//  %r, %rd = call @foo(%a, %ad)
+//  %r, %rd = util.call @foo(%a, %ad)
 //  %2 = flow.tensor.tie_shape %r : tensor<?xf32>{%rd}
-static void expandCallOp(mlir::func::CallOp op, IndexSet &indexSet,
-                         TensorDimMap &tensorDimMap) {
+static void expandCallOp(IREE::Util::CallOp op, SymbolTable &symbolTable,
+                         IndexSet &indexSet, TensorDimMap &tensorDimMap) {
   if (!usesDynamicTensors(op))
     return;
 
+  // Ignore calls to public/external functions.
+  auto calleeOp = symbolTable.lookup<CallableOpInterface>(op.getCallee());
+  if (IREE::Util::isPublicOrExternal(calleeOp))
+    return;
+
   // Build the new call op with expanded operands and results.
   OpBuilder builder(op);
-  auto operands = expandOperands(op.getLoc(), op.getOperands(), tensorDimMap,
-                                 indexSet, builder);
-  auto resultTypes = expandTypes(op.getResultTypes());
-  auto newOp = builder.create<mlir::func::CallOp>(op.getLoc(), op.getCallee(),
-                                                  resultTypes, operands);
+  auto newOp = op.cloneAndExpand(
+      [&](unsigned i, Value operand, SmallVectorImpl<Value> &newOperands) {
+        expandOperand(op.getLoc(), operand, newOperands, tensorDimMap, indexSet,
+                      builder);
+      },
+      [&](unsigned i, Type type, SmallVectorImpl<Type> &newTypes) {
+        expandType(type, newTypes);
+      },
+      builder);
 
   retieResults(op, newOp, tensorDimMap);
   op.erase();
@@ -394,17 +421,19 @@
 //
 // Example:
 //  %1 = flow.tensor.tie_shape %0 : tensor<?xf32>{%d}
-//  return %1
+//  util.return %1
 //  ->
-//  return %0, %d
-static void expandReturnOp(mlir::func::ReturnOp op, IndexSet &indexSet,
+//  util.return %0, %d
+static void expandReturnOp(IREE::Util::ReturnOp op, IndexSet &indexSet,
                            TensorDimMap &tensorDimMap) {
   if (!usesDynamicTensors(op))
     return;
+  if (IREE::Util::isPublicOrExternal(op->getParentOfType<IREE::Util::FuncOp>()))
+    return;
   OpBuilder builder(op);
   auto operands = expandOperands(op.getLoc(), op.getOperands(), tensorDimMap,
                                  indexSet, builder);
-  builder.create<mlir::func::ReturnOp>(op.getLoc(), operands);
+  builder.create<IREE::Util::ReturnOp>(op.getLoc(), operands);
   op.erase();
 }
 
@@ -484,8 +513,9 @@
   op.erase();
 }
 
-static void expandWhileOp(mlir::scf::WhileOp op, ExpandedGlobalMap &globalMap,
-                          IndexSet &indexSet, TensorDimMap &tensorDimMap) {
+static void expandWhileOp(mlir::scf::WhileOp op, SymbolTable &symbolTable,
+                          ExpandedGlobalMap &globalMap, IndexSet &indexSet,
+                          TensorDimMap &tensorDimMap) {
   OpBuilder builder(op);
   auto operands = expandOperands(op.getLoc(), op.getOperands(), tensorDimMap,
                                  indexSet, builder);
@@ -498,14 +528,17 @@
   newOp.getBefore().takeBody(op.getBefore());
   newOp.getAfter().takeBody(op.getAfter());
 
-  expandRegion(newOp.getBefore(), globalMap, indexSet, tensorDimMap);
-  expandRegion(newOp.getAfter(), globalMap, indexSet, tensorDimMap);
+  expandRegion(newOp.getBefore(), symbolTable, globalMap, indexSet,
+               tensorDimMap);
+  expandRegion(newOp.getAfter(), symbolTable, globalMap, indexSet,
+               tensorDimMap);
   retieResults(op, newOp, tensorDimMap);
   op.erase();
 }
 
-static void expandIfOp(mlir::scf::IfOp op, ExpandedGlobalMap &globalMap,
-                       IndexSet &indexSet, TensorDimMap &tensorDimMap) {
+static void expandIfOp(mlir::scf::IfOp op, SymbolTable &symbolTable,
+                       ExpandedGlobalMap &globalMap, IndexSet &indexSet,
+                       TensorDimMap &tensorDimMap) {
   OpBuilder builder(op);
   auto resultTypes = expandTypes(op.getResultTypes());
 
@@ -513,11 +546,12 @@
       op.getLoc(), resultTypes, op.getOperand(), op.elseBlock() != nullptr);
 
   newOp.getBodyRegion().takeBody(op.getBodyRegion());
-  expandRegion(newOp.getBodyRegion(), globalMap, indexSet, tensorDimMap);
-
+  expandRegion(newOp.getBodyRegion(), symbolTable, globalMap, indexSet,
+               tensorDimMap);
   if (newOp.elseBlock()) {
     newOp.getElseRegion().takeBody(op.getElseRegion());
-    expandRegion(newOp.getElseRegion(), globalMap, indexSet, tensorDimMap);
+    expandRegion(newOp.getElseRegion(), symbolTable, globalMap, indexSet,
+                 tensorDimMap);
   }
 
   retieResults(op, newOp, tensorDimMap);
@@ -544,19 +578,21 @@
 }
 
 // Recursively expands tensors into (tensor, dynamic dims...) in |op|.
-static void expandTensorDims(Operation *op, ExpandedGlobalMap &globalMap,
-                             IndexSet &indexSet, TensorDimMap &tensorDimMap) {
+static void expandTensorDims(Operation *op, SymbolTable &symbolTable,
+                             ExpandedGlobalMap &globalMap, IndexSet &indexSet,
+                             TensorDimMap &tensorDimMap) {
   if (auto loadOp = dyn_cast<IREE::Util::GlobalLoadOpInterface>(op)) {
     expandGlobalLoadOp(loadOp, globalMap, indexSet, tensorDimMap);
   } else if (auto storeOp = dyn_cast<IREE::Util::GlobalStoreOpInterface>(op)) {
     expandGlobalStoreOp(storeOp, globalMap, indexSet, tensorDimMap);
   } else if (auto initializerOp = dyn_cast<IREE::Util::InitializerOp>(op)) {
-    expandInitializerOp(initializerOp, globalMap, indexSet, tensorDimMap);
-  } else if (auto funcOp = dyn_cast<mlir::func::FuncOp>(op)) {
-    expandFuncOp(funcOp, globalMap, indexSet, tensorDimMap);
-  } else if (auto callOp = dyn_cast<mlir::func::CallOp>(op)) {
-    expandCallOp(callOp, indexSet, tensorDimMap);
-  } else if (auto returnOp = dyn_cast<mlir::func::ReturnOp>(op)) {
+    expandInitializerOp(initializerOp, symbolTable, globalMap, indexSet,
+                        tensorDimMap);
+  } else if (auto funcOp = dyn_cast<IREE::Util::FuncOp>(op)) {
+    expandFuncOp(funcOp, symbolTable, globalMap, indexSet, tensorDimMap);
+  } else if (auto callOp = dyn_cast<IREE::Util::CallOp>(op)) {
+    expandCallOp(callOp, symbolTable, indexSet, tensorDimMap);
+  } else if (auto returnOp = dyn_cast<IREE::Util::ReturnOp>(op)) {
     expandReturnOp(returnOp, indexSet, tensorDimMap);
   } else if (auto branchOp = dyn_cast<mlir::cf::BranchOp>(op)) {
     expandBranchOp(branchOp, indexSet, tensorDimMap);
@@ -565,9 +601,9 @@
   } else if (auto selectOp = dyn_cast<mlir::arith::SelectOp>(op)) {
     expandSelectOp(selectOp, indexSet, tensorDimMap);
   } else if (auto whileOp = dyn_cast<mlir::scf::WhileOp>(op)) {
-    expandWhileOp(whileOp, globalMap, indexSet, tensorDimMap);
+    expandWhileOp(whileOp, symbolTable, globalMap, indexSet, tensorDimMap);
   } else if (auto ifOp = dyn_cast<mlir::scf::IfOp>(op)) {
-    expandIfOp(ifOp, globalMap, indexSet, tensorDimMap);
+    expandIfOp(ifOp, symbolTable, globalMap, indexSet, tensorDimMap);
   } else if (auto yieldOp = dyn_cast<mlir::scf::YieldOp>(op)) {
     expandScfYieldOp(yieldOp, indexSet, tensorDimMap);
   } else if (auto conditionOp = dyn_cast<mlir::scf::ConditionOp>(op)) {
@@ -593,7 +629,6 @@
   ExpandTensorShapesPass() = default;
 
   void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<mlir::func::FuncDialect>();
     registry.insert<mlir::arith::ArithDialect>();
     registry.insert<IREE::Flow::FlowDialect>();
     registry.insert<IREE::Util::UtilDialect>();
@@ -601,9 +636,10 @@
 
   void runOnOperation() override {
     auto rootOp = getOperation();
+    SymbolTable symbolTable(rootOp);
 
     // Expand all util.global ops holding tensor into tensor + dynamic dims.
-    auto globalMap = expandGlobalTensorDims(rootOp);
+    auto globalMap = expandGlobalTensorDims(rootOp, symbolTable);
 
     // Walk the entire IR tree and expand the globals.
     // We could do this via pattern application but that gets much trickier to
@@ -618,7 +654,8 @@
                             ? OpBuilder(callableOp)
                             : OpBuilder::atBlockBegin(&region->front()));
       TensorDimMap tensorDimMap;
-      expandTensorDims(callableOp, globalMap, indexSet, tensorDimMap);
+      expandTensorDims(callableOp, symbolTable, globalMap, indexSet,
+                       tensorDimMap);
     }
   }
 };

diff --git a/compiler/src/iree/compiler/GlobalOptimization/MaterializeHomogeneousEncodings.cpp b/compiler/src/iree/compiler/GlobalOptimization/MaterializeHomogeneousEncodings.cpp
index 63c1a3a..bf62869 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/MaterializeHomogeneousEncodings.cpp
+++ b/compiler/src/iree/compiler/GlobalOptimization/MaterializeHomogeneousEncodings.cpp

@@ -13,7 +13,6 @@
 #include "iree/compiler/Utils/PassUtils.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinTypes.h"
@@ -24,7 +23,7 @@
 namespace mlir::iree_compiler::GlobalOptimization {
 
 using FunctionLikeNest =
-    MultiOpNest<func::FuncOp, IREE::Util::InitializerOp, IREE::Util::FuncOp>;
+    MultiOpNest<IREE::Util::InitializerOp, IREE::Util::FuncOp>;
 
 class MaterializeHomogeneousEncodingsPass
     : public MaterializeHomogeneousEncodingsBase<

diff --git a/compiler/src/iree/compiler/GlobalOptimization/Passes.cpp b/compiler/src/iree/compiler/GlobalOptimization/Passes.cpp
index e53f7cd..ccf47c5 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/Passes.cpp
+++ b/compiler/src/iree/compiler/GlobalOptimization/Passes.cpp

@@ -9,7 +9,6 @@
 #include "iree/compiler/Dialect/Util/Transforms/Passes.h"
 #include "iree/compiler/Modules/IO/Parameters/Transforms/Passes.h"
 #include "iree/compiler/Utils/PassUtils.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Linalg/Passes.h"
 #include "mlir/Dialect/MemRef/Transforms/Passes.h"
 #include "mlir/Transforms/Passes.h"
@@ -17,7 +16,7 @@
 namespace mlir::iree_compiler::GlobalOptimization {
 
 using FunctionLikeNest =
-    MultiOpNest<func::FuncOp, IREE::Util::InitializerOp, IREE::Util::FuncOp>;
+    MultiOpNest<IREE::Util::InitializerOp, IREE::Util::FuncOp>;
 
 static llvm::cl::opt<bool> clEnableQuantizedMatmulReassociation(
     "iree-global-opt-enable-quantized-matmul-reassociation",

diff --git a/compiler/src/iree/compiler/GlobalOptimization/test/cleanup_numeric_narrowing.mlir b/compiler/src/iree/compiler/GlobalOptimization/test/cleanup_numeric_narrowing.mlir
index 27a59fb..963202b 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/test/cleanup_numeric_narrowing.mlir
+++ b/compiler/src/iree/compiler/GlobalOptimization/test/cleanup_numeric_narrowing.mlir

@@ -1,8 +1,8 @@
 // RUN: iree-opt --iree-global-opt-cleanup-numeric-narrowing %s | FileCheck %s
 
 // CHECK-LABEL: @remove_inferences
-func.func @remove_inferences(%arg0 : tensor<5x3xf32>) -> tensor<5x3xf32> {
+util.func public @remove_inferences(%arg0 : tensor<5x3xf32>) -> tensor<5x3xf32> {
   %0 = util.numeric.optional_narrow %arg0 : tensor<5x3xf32> as ui3 {max_value = 5 : ui3, min_value = 5 : ui3}
-  // CHECK: return %arg0
-  return %0 : tensor<5x3xf32>
+  // CHECK: util.return %arg0
+  util.return %0 : tensor<5x3xf32>
 }

diff --git a/compiler/src/iree/compiler/GlobalOptimization/test/conv1x1_to_matmul.mlir b/compiler/src/iree/compiler/GlobalOptimization/test/conv1x1_to_matmul.mlir
index 18df3dd..ad1cdbd 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/test/conv1x1_to_matmul.mlir
+++ b/compiler/src/iree/compiler/GlobalOptimization/test/conv1x1_to_matmul.mlir

@@ -1,12 +1,12 @@
 // RUN: iree-opt --split-input-file -iree-global-opt-convert-1x1-filter-conv2d-to-matmul %s | FileCheck %s
 
-func.func @nhwc_conv_2d(%input: tensor<1x4x5x2xf32>, %filter: tensor<1x1x2x7xf32>) -> tensor<1x4x5x7xf32> {
+util.func public @nhwc_conv_2d(%input: tensor<1x4x5x2xf32>, %filter: tensor<1x1x2x7xf32>) -> tensor<1x4x5x7xf32> {
     %0 = tensor.empty() : tensor<1x4x5x7xf32>
     %1 = linalg.conv_2d_nhwc_hwcf {
         dilations = dense<1> : tensor<2xi64>,
         strides = dense<1> : tensor<2xi64>
     } ins(%input, %filter : tensor<1x4x5x2xf32>, tensor<1x1x2x7xf32>) outs(%0 : tensor<1x4x5x7xf32>) -> tensor<1x4x5x7xf32>
-    return %1 : tensor<1x4x5x7xf32>
+    util.return %1 : tensor<1x4x5x7xf32>
 }
 
 // CHECK: @nhwc_conv_2d
@@ -18,12 +18,12 @@
 // CHECK: %[[RESHAPED_OUTPUT:.+]] = tensor.collapse_shape %[[OUTPUT]] {{\[}}[0, 1, 2], [3]] : tensor<1x4x5x7xf32> into tensor<20x7xf32>
 // CHECK: %[[MATMUL_RESULT:.+]] = linalg.matmul ins(%[[RESHAPED_INPUT]], %[[RESHAPED_FILTER]] : tensor<20x2xf32>, tensor<2x7xf32>) outs(%[[RESHAPED_OUTPUT]] : tensor<20x7xf32>)
 // CHECK: %[[RESULT:.+]] = tensor.expand_shape %[[MATMUL_RESULT]] {{\[}}[0, 1, 2], [3]] : tensor<20x7xf32> into tensor<1x4x5x7xf32>
-// CHECK: return %[[RESULT]]
+// CHECK: util.return %[[RESULT]]
 
 // -----
 
 // CHECK: @dynamic_nhwc_conv_2d
-func.func @dynamic_nhwc_conv_2d(%input: tensor<1x4x?x2xf32>, %filter: tensor<1x1x2x7xf32>) -> tensor<1x4x?x7xf32> {
+util.func public @dynamic_nhwc_conv_2d(%input: tensor<1x4x?x2xf32>, %filter: tensor<1x1x2x7xf32>) -> tensor<1x4x?x7xf32> {
     %c2 = arith.constant 2 : index
     %d2 = tensor.dim %input, %c2 : tensor<1x4x?x2xf32>
     %0 = tensor.empty(%d2) : tensor<1x4x?x7xf32>
@@ -31,7 +31,7 @@
         dilations = dense<1> : tensor<2xi64>,
         strides = dense<1> : tensor<2xi64>
     } ins(%input, %filter : tensor<1x4x?x2xf32>, tensor<1x1x2x7xf32>) outs(%0 : tensor<1x4x?x7xf32>) -> tensor<1x4x?x7xf32>
-    return %1 : tensor<1x4x?x7xf32>
+    util.return %1 : tensor<1x4x?x7xf32>
 }
 
 // CHECK: %[[INPUT:.+]]: tensor<1x4x?x2xf32>
@@ -47,7 +47,7 @@
 
 // -----
 
-func.func @fail_dynamic_nhwc_conv_2d(%input: tensor<1x?x?x2xf32>, %filter: tensor<1x1x2x7xf32>) -> tensor<1x?x?x7xf32> {
+util.func public @fail_dynamic_nhwc_conv_2d(%input: tensor<1x?x?x2xf32>, %filter: tensor<1x1x2x7xf32>) -> tensor<1x?x?x7xf32> {
     %c1 = arith.constant 1 : index
     %c2 = arith.constant 2 : index
     %d1 = tensor.dim %input, %c1 : tensor<1x?x?x2xf32>
@@ -57,7 +57,7 @@
         dilations = dense<1> : tensor<2xi64>,
         strides = dense<1> : tensor<2xi64>
     } ins(%input, %filter : tensor<1x?x?x2xf32>, tensor<1x1x2x7xf32>) outs(%0 : tensor<1x?x?x7xf32>) -> tensor<1x?x?x7xf32>
-    return %1 : tensor<1x?x?x7xf32>
+    util.return %1 : tensor<1x?x?x7xf32>
 }
 
 // CHECK: @fail_dynamic_nhwc_conv_2d
@@ -65,13 +65,13 @@
 
 // -----
 
-func.func @nchw_conv_2d(%input: tensor<1x2x4x5xf32>, %filter: tensor<7x2x1x1xf32>) -> tensor<1x7x4x5xf32> {
+util.func public @nchw_conv_2d(%input: tensor<1x2x4x5xf32>, %filter: tensor<7x2x1x1xf32>) -> tensor<1x7x4x5xf32> {
     %0 = tensor.empty() : tensor<1x7x4x5xf32>
     %1 = linalg.conv_2d_nchw_fchw {
         dilations = dense<1> : tensor<2xi64>,
         strides = dense<1> : tensor<2xi64>
     } ins(%input, %filter : tensor<1x2x4x5xf32>, tensor<7x2x1x1xf32>) outs(%0 : tensor<1x7x4x5xf32>) -> tensor<1x7x4x5xf32>
-    return %1 : tensor<1x7x4x5xf32>
+    util.return %1 : tensor<1x7x4x5xf32>
 }
 // CHECK: @nchw_conv_2d
 // CHECK: %[[INPUT:.+]]: tensor<1x2x4x5xf32>
@@ -82,11 +82,11 @@
 // CHECK: %[[RESHAPED_OUTPUT:.+]] = tensor.collapse_shape %[[OUTPUT]] {{\[}}[0, 1], [2, 3]] : tensor<1x7x4x5xf32> into tensor<7x20xf32>
 // CHECK: %[[MATMUL_RESULT:.+]] = linalg.matmul ins(%[[RESHAPED_FILTER]], %[[RESHAPED_INPUT]] : tensor<7x2xf32>, tensor<2x20xf32>) outs(%[[RESHAPED_OUTPUT]] : tensor<7x20xf32>)
 // CHECK: %[[RESULT:.+]] = tensor.expand_shape %[[MATMUL_RESULT]] {{\[}}[0, 1], [2, 3]] : tensor<7x20xf32> into tensor<1x7x4x5xf32>
-// CHECK: return %[[RESULT]]
+// CHECK: util.return %[[RESULT]]
 
 // -----
 
-func.func @dynamic_nchw_conv_2d(%input: tensor<1x2x4x?xf32>, %filter: tensor<7x2x1x1xf32>) -> tensor<1x7x4x?xf32> {
+util.func public @dynamic_nchw_conv_2d(%input: tensor<1x2x4x?xf32>, %filter: tensor<7x2x1x1xf32>) -> tensor<1x7x4x?xf32> {
     %c3 = arith.constant 3 : index
     %d3 = tensor.dim %input, %c3 : tensor<1x2x4x?xf32>
     %0 = tensor.empty(%d3) : tensor<1x7x4x?xf32>
@@ -94,7 +94,7 @@
         dilations = dense<1> : tensor<2xi64>,
         strides = dense<1> : tensor<2xi64>
     } ins(%input, %filter : tensor<1x2x4x?xf32>, tensor<7x2x1x1xf32>) outs(%0 : tensor<1x7x4x?xf32>) -> tensor<1x7x4x?xf32>
-    return %1 : tensor<1x7x4x?xf32>
+    util.return %1 : tensor<1x7x4x?xf32>
 }
 
 // CHECK: @dynamic_nchw_conv_2d
@@ -108,11 +108,11 @@
 // CHECK: %[[RESHAPED_OUTPUT:.+]] = tensor.collapse_shape %[[OUTPUT]] {{\[}}[0, 1], [2, 3]] : tensor<1x7x4x?xf32> into tensor<7x?xf32>
 // CHECK: %[[MATMUL_RESULT:.+]] = linalg.matmul ins(%[[RESHAPED_FILTER]], %[[RESHAPED_INPUT]] : tensor<7x2xf32>, tensor<2x?xf32>) outs(%[[RESHAPED_OUTPUT]] : tensor<7x?xf32>)
 // CHECK: %[[RESULT:.+]] = tensor.expand_shape %[[MATMUL_RESULT]] {{\[}}[0, 1], [2, 3]] : tensor<7x?xf32> into tensor<1x7x4x?xf32>
-// CHECK: return %[[RESULT]]
+// CHECK: util.return %[[RESULT]]
 
 // -----
 
-func.func @fail_dynamic_nchw_conv_2d(%input: tensor<1x2x?x?xf32>, %filter: tensor<7x2x1x1xf32>) -> tensor<1x7x?x?xf32> {
+util.func public @fail_dynamic_nchw_conv_2d(%input: tensor<1x2x?x?xf32>, %filter: tensor<7x2x1x1xf32>) -> tensor<1x7x?x?xf32> {
     %c2 = arith.constant 2 : index
     %c3 = arith.constant 3 : index
     %d2 = tensor.dim %input, %c2 : tensor<1x2x?x?xf32>
@@ -122,7 +122,7 @@
         dilations = dense<1> : tensor<2xi64>,
         strides = dense<1> : tensor<2xi64>
     } ins(%input, %filter : tensor<1x2x?x?xf32>, tensor<7x2x1x1xf32>) outs(%0 : tensor<1x7x?x?xf32>) -> tensor<1x7x?x?xf32>
-    return %1 : tensor<1x7x?x?xf32>
+    util.return %1 : tensor<1x7x?x?xf32>
 }
 
 // CHECK: @fail_dynamic_nchw_conv_2d

diff --git a/compiler/src/iree/compiler/GlobalOptimization/test/demote_contraction_inputs_to_bf16.mlir b/compiler/src/iree/compiler/GlobalOptimization/test/demote_contraction_inputs_to_bf16.mlir
index 66b9067..ff717b8 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/test/demote_contraction_inputs_to_bf16.mlir
+++ b/compiler/src/iree/compiler/GlobalOptimization/test/demote_contraction_inputs_to_bf16.mlir

@@ -1,10 +1,10 @@
 // RUN: iree-opt --split-input-file -iree-global-opt-demote-contraction-inputs-to-bf16 %s | FileCheck %s
 
-func.func @matmul_f32f32f32(%arg0 : tensor<100x250xf32>, %arg1 : tensor<250x500xf32>,
+util.func public @matmul_f32f32f32(%arg0 : tensor<100x250xf32>, %arg1 : tensor<250x500xf32>,
     %arg2 : tensor<100x500xf32>) -> tensor<100x500xf32> {
   %0 = linalg.matmul ins(%arg0, %arg1 : tensor<100x250xf32>, tensor<250x500xf32>)
       outs(%arg2 : tensor<100x500xf32>) -> tensor<100x500xf32>
-  return %0 : tensor<100x500xf32>
+  util.return %0 : tensor<100x500xf32>
 }
 
 // CHECK: @matmul_f32f32f32
@@ -23,11 +23,11 @@
 
 // -----
 
-func.func @dynamic_matmul_f32f32f32(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>,
+util.func public @dynamic_matmul_f32f32f32(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>,
     %arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
       outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %0 : tensor<?x?xf32>
+  util.return %0 : tensor<?x?xf32>
 }
 
 // CHECK: @dynamic_matmul_f32f32f32
@@ -44,11 +44,11 @@
 
 // -----
 
-func.func @batch_matmul_f32f32f32(%arg0 : tensor<4x100x250xf32>, %arg1 : tensor<4x250x500xf32>,
+util.func public @batch_matmul_f32f32f32(%arg0 : tensor<4x100x250xf32>, %arg1 : tensor<4x250x500xf32>,
     %arg2 : tensor<4x100x500xf32>) -> tensor<4x100x500xf32> {
   %0 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<4x100x250xf32>, tensor<4x250x500xf32>)
       outs(%arg2 : tensor<4x100x500xf32>) -> tensor<4x100x500xf32>
-  return %0 : tensor<4x100x500xf32>
+  util.return %0 : tensor<4x100x500xf32>
 }
 
 // CHECK: @batch_matmul_f32f32f32
@@ -67,11 +67,11 @@
 
 // -----
 
-func.func @matvec_f32f32f32(%arg0 : tensor<100x250xf32>, %arg1 : tensor<250xf32>,
+util.func public @matvec_f32f32f32(%arg0 : tensor<100x250xf32>, %arg1 : tensor<250xf32>,
     %arg2 : tensor<100xf32>) -> tensor<100xf32> {
   %0 = linalg.matvec ins(%arg0, %arg1 : tensor<100x250xf32>, tensor<250xf32>)
       outs(%arg2 : tensor<100xf32>) -> tensor<100xf32>
-  return %0 : tensor<100xf32>
+  util.return %0 : tensor<100xf32>
 }
 
 // CHECK: @matvec_f32f32f32
@@ -90,11 +90,11 @@
 
 // -----
 
-func.func @batch_vecmat_f32f32f32(%arg0 : tensor<4x250xf32>, %arg1 : tensor<4x250x500xf32>,
+util.func public @batch_vecmat_f32f32f32(%arg0 : tensor<4x250xf32>, %arg1 : tensor<4x250x500xf32>,
     %arg2 : tensor<4x500xf32>) -> tensor<4x500xf32> {
   %0 = linalg.batch_vecmat ins(%arg0, %arg1 : tensor<4x250xf32>, tensor<4x250x500xf32>)
       outs(%arg2 : tensor<4x500xf32>) -> tensor<4x500xf32>
-  return %0 : tensor<4x500xf32>
+  util.return %0 : tensor<4x500xf32>
 }
 
 // CHECK: @batch_vecmat_f32f32f32
@@ -113,11 +113,11 @@
 
 // -----
 
-func.func @nonmatch_matmul_f32f32f64(%arg0 : tensor<100x250xf32>, %arg1 : tensor<250x500xf32>,
+util.func public @nonmatch_matmul_f32f32f64(%arg0 : tensor<100x250xf32>, %arg1 : tensor<250x500xf32>,
     %arg2 : tensor<100x500xf64>) -> tensor<100x500xf64> {
   %0 = linalg.matmul ins(%arg0, %arg1 : tensor<100x250xf32>, tensor<250x500xf32>)
       outs(%arg2 : tensor<100x500xf64>) -> tensor<100x500xf64>
-  return %0 : tensor<100x500xf64>
+  util.return %0 : tensor<100x500xf64>
 }
 
 // CHECK: @nonmatch_matmul_f32f32f64
@@ -130,11 +130,11 @@
 
 // -----
 
-func.func @batch_matmul_transpose_a_f32f32f32(%arg0 : tensor<4x250x100xf32>, %arg1 : tensor<4x250x500xf32>,
+util.func public @batch_matmul_transpose_a_f32f32f32(%arg0 : tensor<4x250x100xf32>, %arg1 : tensor<4x250x500xf32>,
     %arg2 : tensor<4x100x500xf32>) -> tensor<4x100x500xf32> {
   %0 = linalg.batch_matmul_transpose_a ins(%arg0, %arg1 : tensor<4x250x100xf32>, tensor<4x250x500xf32>)
       outs(%arg2 : tensor<4x100x500xf32>) -> tensor<4x100x500xf32>
-  return %0 : tensor<4x100x500xf32>
+  util.return %0 : tensor<4x100x500xf32>
 }
 
 // CHECK: @batch_matmul_transpose_a_f32f32f32
@@ -153,11 +153,11 @@
 
 // -----
 
-func.func @batch_matmul_transpose_b_f32f32f32(%arg0 : tensor<4x100x250xf32>, %arg1 : tensor<4x500x250xf32>,
+util.func public @batch_matmul_transpose_b_f32f32f32(%arg0 : tensor<4x100x250xf32>, %arg1 : tensor<4x500x250xf32>,
     %arg2 : tensor<4x100x500xf32>) -> tensor<4x100x500xf32> {
   %0 = linalg.batch_matmul_transpose_b ins(%arg0, %arg1 : tensor<4x100x250xf32>, tensor<4x500x250xf32>)
       outs(%arg2 : tensor<4x100x500xf32>) -> tensor<4x100x500xf32>
-  return %0 : tensor<4x100x500xf32>
+  util.return %0 : tensor<4x100x500xf32>
 }
 
 // CHECK: @batch_matmul_transpose_b_f32f32f32
@@ -176,11 +176,11 @@
 
 // -----
 
-func.func @matmul_transpose_a_f32f32f32(%arg0 : tensor<250x100xf32>, %arg1 : tensor<250x500xf32>,
+util.func public @matmul_transpose_a_f32f32f32(%arg0 : tensor<250x100xf32>, %arg1 : tensor<250x500xf32>,
     %arg2 : tensor<100x500xf32>) -> tensor<100x500xf32> {
   %0 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<250x100xf32>, tensor<250x500xf32>)
       outs(%arg2 : tensor<100x500xf32>) -> tensor<100x500xf32>
-  return %0 : tensor<100x500xf32>
+  util.return %0 : tensor<100x500xf32>
 }
 
 // CHECK: @matmul_transpose_a_f32f32f32
@@ -199,11 +199,11 @@
 
 // -----
 
-func.func @matmul_transpose_b_f32f32f32(%arg0 : tensor<100x250xf32>, %arg1 : tensor<500x250xf32>,
+util.func public @matmul_transpose_b_f32f32f32(%arg0 : tensor<100x250xf32>, %arg1 : tensor<500x250xf32>,
     %arg2 : tensor<100x500xf32>) -> tensor<100x500xf32> {
   %0 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<100x250xf32>, tensor<500x250xf32>)
       outs(%arg2 : tensor<100x500xf32>) -> tensor<100x500xf32>
-  return %0 : tensor<100x500xf32>
+  util.return %0 : tensor<100x500xf32>
 }
 
 // CHECK: @matmul_transpose_b_f32f32f32

diff --git a/compiler/src/iree/compiler/GlobalOptimization/test/detach_elementwise_from_named_ops.mlir b/compiler/src/iree/compiler/GlobalOptimization/test/detach_elementwise_from_named_ops.mlir
index 162acf9..cec787f 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/test/detach_elementwise_from_named_ops.mlir
+++ b/compiler/src/iree/compiler/GlobalOptimization/test/detach_elementwise_from_named_ops.mlir

@@ -1,6 +1,6 @@
 // RUN: iree-opt --split-input-file --iree-global-opt-detach-elementwise-from-named-ops --mlir-print-local-scope %s | FileCheck %s
 
-func.func @matmul(%a: tensor<?x64xf32>, %b: tensor<64x?xf32>, %c: tensor<?x?xf32>) -> tensor<?x?xf32> {
+util.func public @matmul(%a: tensor<?x64xf32>, %b: tensor<64x?xf32>, %c: tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = linalg.generic {
       indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
       iterator_types = ["parallel", "parallel"]}
@@ -10,10 +10,10 @@
       linalg.yield %1 : f32
     } -> tensor<?x?xf32>
   %1 = linalg.matmul ins(%a, %b : tensor<?x64xf32>, tensor<64x?xf32>) outs(%0 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %1 : tensor<?x?xf32>
+  util.return %1 : tensor<?x?xf32>
 }
 
-// CHECK-LABEL: func @matmul
+// CHECK-LABEL: util.func public @matmul
 //  CHECK-SAME: (%[[A:.+]]: tensor<?x64xf32>, %[[B:.+]]: tensor<64x?xf32>, %[[ARG2:.+]]: tensor<?x?xf32>)
 
 //   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
@@ -36,11 +36,11 @@
 //       CHECK:   ^{{.+}}(%[[ARG0:.+]]: f32, %[[ARG1:.+]]: f32, %{{.+}}: f32):
 //       CHECK:     %[[ADD:.+]] = arith.addf %[[ARG0]], %[[ARG1]] : f32
 //       CHECK:     linalg.yield %[[ADD]] : f32
-//       CHECK:   return %[[EW]]
+//       CHECK:   util.return %[[EW]]
 
 // -----
 
-func.func @batch_matmul(%a: tensor<?x8x?xi32>, %b: tensor<?x?x16xi32>, %c: tensor<?x8x16xi32>) -> tensor<?x8x16xi32> {
+util.func public @batch_matmul(%a: tensor<?x8x?xi32>, %b: tensor<?x?x16xi32>, %c: tensor<?x8x16xi32>) -> tensor<?x8x16xi32> {
   %0 = linalg.generic {
       indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
       iterator_types = ["parallel", "parallel", "parallel"]}
@@ -50,10 +50,10 @@
       linalg.yield %1 : i32
     } -> tensor<?x8x16xi32>
   %1 = linalg.batch_matmul ins(%a, %b : tensor<?x8x?xi32>, tensor<?x?x16xi32>) outs(%0 : tensor<?x8x16xi32>) -> tensor<?x8x16xi32>
-  return %1 : tensor<?x8x16xi32>
+  util.return %1 : tensor<?x8x16xi32>
 }
 
-// CHECK-LABEL: func @batch_matmul
+// CHECK-LABEL: util.func public @batch_matmul
 //  CHECK-SAME: (%[[A:.+]]: tensor<?x8x?xi32>, %[[B:.+]]: tensor<?x?x16xi32>, %[[ARG2:.+]]: tensor<?x8x16xi32>)
 
 //   CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
@@ -71,11 +71,11 @@
 //  CHECK-SAME:     outs(%[[FILL]] : tensor<?x8x16xi32>)
 //       CHECK:     %[[ADD:.+]] = arith.addi
 //       CHECK:     linalg.yield %[[ADD]] : i32
-//       CHECK:   return %[[EW]]
+//       CHECK:   util.return %[[EW]]
 
 // -----
 
-func.func @conv(%input: tensor<1x225x225x3xf32>, %filter: tensor<3x3x3x32xf32>, %init: tensor<32xf32>) -> tensor<1x112x112x32xf32> {
+util.func public @conv(%input: tensor<1x225x225x3xf32>, %filter: tensor<3x3x3x32xf32>, %init: tensor<32xf32>) -> tensor<1x112x112x32xf32> {
   %init0 = tensor.empty() : tensor<1x112x112x32xf32>
   %0 = linalg.generic {
       indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>],
@@ -86,10 +86,10 @@
     } -> tensor<1x112x112x32xf32>
   %1 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
     ins(%input, %filter : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%0 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
-  return %1 : tensor<1x112x112x32xf32>
+  util.return %1 : tensor<1x112x112x32xf32>
 }
 
-// CHECK-LABEL: func @conv
+// CHECK-LABEL: util.func public @conv
 //  CHECK-SAME: (%{{.+}}: tensor<1x225x225x3xf32>, %{{.+}}: tensor<3x3x3x32xf32>, %[[BIAS:.+]]: tensor<32xf32>)
 //       CHECK:   %[[INIT:.+]] = linalg.generic
 //  CHECK-SAME:     ins(%[[BIAS]] :
@@ -101,7 +101,7 @@
 
 // -----
 
-func.func @keep_fill(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>) -> tensor<?x?xf32> {
+util.func public @keep_fill(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>) -> tensor<?x?xf32> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %cst = arith.constant 0.0 : f32
@@ -111,24 +111,24 @@
   %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<?x?xf32>) -> tensor<?x?xf32>
   %gemm = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
       outs(%fill : tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %gemm : tensor<?x?xf32>
+  util.return %gemm : tensor<?x?xf32>
 }
-// CHECK-LABEL: func.func @keep_fill
+// CHECK-LABEL: util.func public @keep_fill
 //   CHECK-NOT: linalg.generic
 
 // -----
 
-func.func @keep_arg(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>, %arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> {
+util.func public @keep_arg(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>, %arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
       outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %0 : tensor<?x?xf32>
+  util.return %0 : tensor<?x?xf32>
 }
-// CHECK-LABEL: func.func @keep_arg
+// CHECK-LABEL: util.func public @keep_arg
 //   CHECK-NOT: linalg.generic
 
 // -----
 
-func.func @fft_cst_output(%arg0 : tensor<3x2190x1x512xf32>)
+util.func public @fft_cst_output(%arg0 : tensor<3x2190x1x512xf32>)
     -> (tensor<3x2190x1x512xf32>, tensor<3x2190x1x512xf32>) {
   %c1 = arith.constant 1 : index
   %cst = arith.constant dense<1.000000e+00> : tensor<1xf32>
@@ -138,9 +138,9 @@
       ins(%c1, %cst, %cst_0 : index, tensor<1xf32>, tensor<1xf32>)
       outs(%arg0, %cst_1 : tensor<3x2190x1x512xf32>, tensor<3x2190x1x512xf32>)
       : tensor<3x2190x1x512xf32>, tensor<3x2190x1x512xf32>
-  return %0#0, %0#1 : tensor<3x2190x1x512xf32>, tensor<3x2190x1x512xf32>
+  util.return %0#0, %0#1 : tensor<3x2190x1x512xf32>, tensor<3x2190x1x512xf32>
 }
-// CHECK-LABEL: func @fft_cst_output(
+// CHECK-LABEL: util.func public @fft_cst_output(
 //  CHECK-SAME:     %[[ARG0:.+]]: tensor<3x2190x1x512xf32>
 //   CHECK-DAG:   %[[C0:.+]] = arith.constant 0.000000e+00 : f32
 //   CHECK-DAG:   %[[INIT:.+]] = tensor.empty()
@@ -149,7 +149,7 @@
 //  CHECK-SAME:       outs(%[[INIT]] :
 //       CHECK:   %[[FFT:.+]]:2 = iree_linalg_ext.fft
 //  CHECK-SAME:       outs(%[[ARG0]], %[[FILL]] :
-//       CHECK:   return %[[FFT]]#0, %[[FFT]]#1
+//       CHECK:   util.return %[[FFT]]#0, %[[FFT]]#1
 
 // -----
 
@@ -158,7 +158,7 @@
 #map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d3, d1 * 2 + d4, d2)>
 #map3 = affine_map<(d0, d1, d2, d3, d4) -> (d3, d4)>
 #map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>
-func.func @generic_cst_output(%arg0 : tensor<114x114x64xf32>) -> tensor<56x56x64xf32> {
+util.func public @generic_cst_output(%arg0 : tensor<114x114x64xf32>) -> tensor<56x56x64xf32> {
   %cst = arith.constant dense<0xFF800000> : tensor<56x56x64xf32>
   %1 = tensor.empty() : tensor<3x3xf32>
   %2 = linalg.generic {
@@ -168,9 +168,9 @@
     %3 = arith.maximumf %out, %in : f32
     linalg.yield %3 : f32
   } -> tensor<56x56x64xf32>
-  return %2 : tensor<56x56x64xf32>
+  util.return %2 : tensor<56x56x64xf32>
 }
-// CHECK-LABEL: func @generic_cst_output
+// CHECK-LABEL: util.func public @generic_cst_output
 //  CHECK-SAME:     %[[ARG0:.+]]: tensor<114x114x64xf32>
 //       CHECK:   %[[CST:.+]] = arith.constant 0xFF800000 : f32
 //       CHECK:   %[[INIT:.+]] = tensor.empty() : tensor<56x56x64xf32>
@@ -179,4 +179,4 @@
 //  CHECK-SAME:       outs(%[[INIT]] :
 //       CHECK:   %[[GENERIC:.+]] = linalg.generic
 //  CHECK-SAME:       outs(%[[FILL]] :
-//       CHECK:   return %[[GENERIC]]
+//       CHECK:   util.return %[[GENERIC]]

diff --git a/compiler/src/iree/compiler/GlobalOptimization/test/expand_tensor_shapes.mlir b/compiler/src/iree/compiler/GlobalOptimization/test/expand_tensor_shapes.mlir
index 7f8c1c2..de519db 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/test/expand_tensor_shapes.mlir
+++ b/compiler/src/iree/compiler/GlobalOptimization/test/expand_tensor_shapes.mlir

@@ -8,7 +8,7 @@
 util.global private mutable @loadedGlobal : tensor<4x?x?x2xf32>
 
 // CHECK-LABEL: @globalLoad
-func.func @globalLoad() {
+util.func private @globalLoad() {
   // CHECK-NEXT: %[[TENSOR:.+]] = util.global.load @loadedGlobal : tensor<4x?x?x2xf32>
   // CHECK-NEXT: %[[D1:.+]] = util.global.load @loadedGlobal__d1 : index
   // CHECK-NEXT: %[[D2:.+]] = util.global.load @loadedGlobal__d2 : index
@@ -16,7 +16,7 @@
   %0 = util.global.load @loadedGlobal : tensor<4x?x?x2xf32>
   // CHECK-NEXT: util.optimization_barrier %[[TIED]]
   util.optimization_barrier %0 : tensor<4x?x?x2xf32>
-  return
+  util.return
 }
 
 // -----
@@ -30,13 +30,13 @@
 
 // CHECK-LABEL: @globalStore
 // CHECK-SAME: (%[[ARG0:.+]]: tensor<4x?x?x2xf32>, %[[D1:.+]]: index, %[[D2:.+]]: index)
-func.func @globalStore(%arg0: tensor<4x?x?x2xf32>) {
+util.func private @globalStore(%arg0: tensor<4x?x?x2xf32>) {
   // CHECK-NEXT: %[[TIED:.+]] = flow.tensor.tie_shape %[[ARG0]] : tensor<4x?x?x2xf32>{%[[D1]], %[[D2]]}
   // CHECK-NEXT: util.global.store %[[ARG0]], @storedGlobal : tensor<4x?x?x2xf32>
   // CHECK-NEXT: util.global.store %[[D1]], @storedGlobal__d1 : index
   // CHECK-NEXT: util.global.store %[[D2]], @storedGlobal__d2 : index
   util.global.store %arg0, @storedGlobal : tensor<4x?x?x2xf32>
-  return
+  util.return
 }
 
 // -----
@@ -46,7 +46,7 @@
 // CHECK-LABEL: @funcArgs
 // CHECK-SAME: (%[[ARG0:.+]]: tensor<4x?x?x2xf32>, %[[ARG0_D1:.+]]: index, %[[ARG0_D2:.+]]: index,
 // CHECK-SAME:  %[[ARG1:.+]]: tensor<?xi32>, %[[ARG1_D0:.+]]: index)
-func.func @funcArgs(%arg0: tensor<4x?x?x2xf32>, %arg1: tensor<?xi32>) {
+util.func private @funcArgs(%arg0: tensor<4x?x?x2xf32>, %arg1: tensor<?xi32>) {
   // CHECK-NEXT: %[[TIED_ARG0:.+]] = flow.tensor.tie_shape %[[ARG0]] : tensor<4x?x?x2xf32>{%[[ARG0_D1]], %[[ARG0_D2]]}
   // CHECK-NEXT: %[[TIED_ARG1:.+]] = flow.tensor.tie_shape %[[ARG1]] : tensor<?xi32>{%[[ARG1_D0]]}
 
@@ -55,7 +55,7 @@
   // CHECK-NEXT: util.optimization_barrier %[[TIED_ARG1]]
   util.optimization_barrier %arg1 : tensor<?xi32>
 
-  return
+  util.return
 }
 
 // -----
@@ -65,14 +65,14 @@
 // CHECK-LABEL: @funcResults
 // CHECK-SAME: (%[[ARG0:.+]]: tensor<4x?x?x2xf32>, %[[ARG0_D1:.+]]: index, %[[ARG0_D2:.+]]: index,
 // CHECK-SAME:  %[[ARG1:.+]]: tensor<?xi32>, %[[ARG1_D0:.+]]: index)
-func.func @funcResults(%arg0: tensor<4x?x?x2xf32>, %arg1: tensor<?xi32>) -> (tensor<4x?x?x2xf32>, tensor<?xi32>) {
+util.func private @funcResults(%arg0: tensor<4x?x?x2xf32>, %arg1: tensor<?xi32>) -> (tensor<4x?x?x2xf32>, tensor<?xi32>) {
   // CHECK-NEXT: %[[TIED_ARG0:.+]] = flow.tensor.tie_shape %[[ARG0]] : tensor<4x?x?x2xf32>{%[[ARG0_D1]], %[[ARG0_D2]]}
   // CHECK-NEXT: %[[TIED_ARG1:.+]] = flow.tensor.tie_shape %[[ARG1]] : tensor<?xi32>{%[[ARG1_D0]]}
 
-  // NOTE: we return %arg0/%arg1 instead of the tied ones - this helps the ties
+  // NOTE: we util.return %arg0/%arg1 instead of the tied ones - this helps the ties
   // get dropped early when they aren't needed.
-  // CHECK-NEXT: return %[[ARG0]], %[[ARG0_D1]], %[[ARG0_D2]], %[[ARG1]], %[[ARG1_D0]]
-  return %arg0, %arg1 : tensor<4x?x?x2xf32>, tensor<?xi32>
+  // CHECK-NEXT: util.return %[[ARG0]], %[[ARG0_D1]], %[[ARG0_D2]], %[[ARG1]], %[[ARG1_D0]]
+  util.return %arg0, %arg1 : tensor<4x?x?x2xf32>, tensor<?xi32>
 }
 
 // -----
@@ -82,13 +82,13 @@
 // CHECK-LABEL: @caller
 // CHECK-SAME: (%[[ARG0:.+]]: tensor<4x?x?x2xf32>, %[[ARG0_D1:.+]]: index, %[[ARG0_D2:.+]]: index,
 // CHECK-SAME:  %[[ARG1:.+]]: tensor<?xi32>, %[[ARG1_D0:.+]]: index)
-func.func @caller(%arg0: tensor<4x?x?x2xf32>, %arg1: tensor<?xi32>) {
+util.func private @caller(%arg0: tensor<4x?x?x2xf32>, %arg1: tensor<?xi32>) {
   // CHECK-NEXT: %[[TIED_ARG0:.+]] = flow.tensor.tie_shape %[[ARG0]] : tensor<4x?x?x2xf32>{%[[ARG0_D1]], %[[ARG0_D2]]}
   // CHECK-NEXT: %[[TIED_ARG1:.+]] = flow.tensor.tie_shape %[[ARG1]] : tensor<?xi32>{%[[ARG1_D0]]}
 
-  // CHECK: %[[RET:.+]]:5 = call @callee(%[[ARG0]], %[[ARG0_D1]], %[[ARG0_D2]], %[[ARG1]], %[[ARG1_D0]])
+  // CHECK: %[[RET:.+]]:5 = util.call @callee(%[[ARG0]], %[[ARG0_D1]], %[[ARG0_D2]], %[[ARG1]], %[[ARG1_D0]])
   // CHECK-SAME: (tensor<4x?x?x2xf32>, index, index, tensor<?xi32>, index) -> (tensor<4x?x?x2xf32>, index, index, tensor<?xi32>, index)
-  %0:2 = call @callee(%arg0, %arg1) : (tensor<4x?x?x2xf32>, tensor<?xi32>) -> (tensor<4x?x?x2xf32>, tensor<?xi32>)
+  %0:2 = util.call @callee(%arg0, %arg1) : (tensor<4x?x?x2xf32>, tensor<?xi32>) -> (tensor<4x?x?x2xf32>, tensor<?xi32>)
 
   // CHECK-NEXT: %[[TIED_RET0:.+]] = flow.tensor.tie_shape %[[RET]]#0 : tensor<4x?x?x2xf32>{%[[RET]]#1, %[[RET]]#2}
   // CHECK-NEXT: %[[TIED_RET1:.+]] = flow.tensor.tie_shape %[[RET]]#3 : tensor<?xi32>{%[[RET]]#4}
@@ -98,10 +98,55 @@
   // CHECK-NEXT: util.optimization_barrier %[[TIED_RET1]]
   util.optimization_barrier %0#1 : tensor<?xi32>
 
-  return
+  util.return
 }
 
-func.func private @callee(%arg0: tensor<4x?x?x2xf32>, %arg1: tensor<?xi32>) -> (tensor<4x?x?x2xf32>, tensor<?xi32>)
+util.func private @callee(%arg0: tensor<4x?x?x2xf32>, %arg1: tensor<?xi32>) -> (tensor<4x?x?x2xf32>, tensor<?xi32>) {
+  util.return %arg0, %arg1 : tensor<4x?x?x2xf32>, tensor<?xi32>
+}
+
+// -----
+
+// Tests that tied operands are updated when we expand functions/calls.
+
+// CHECK-LABEL: @tiedCaller
+// CHECK-SAME: (%[[ARG0:.+]]: tensor<4x?x?x2xf32>, %[[ARG0_D1:.+]]: index, %[[ARG0_D2:.+]]: index,
+// CHECK-SAME:  %[[ARG1:.+]]: tensor<?xi32>, %[[ARG1_D0:.+]]: index)
+util.func private @tiedCaller(%arg0: tensor<4x?x?x2xf32>, %arg1: tensor<?xi32>) {
+  // CHECK-NEXT: %[[TIED_ARG0:.+]] = flow.tensor.tie_shape %[[ARG0]] : tensor<4x?x?x2xf32>{%[[ARG0_D1]], %[[ARG0_D2]]}
+  // CHECK-NEXT: %[[TIED_ARG1:.+]] = flow.tensor.tie_shape %[[ARG1]] : tensor<?xi32>{%[[ARG1_D0]]}
+
+  // CHECK: %[[RET:.+]]:5 = util.call @tiedCallee(%[[ARG0]], %[[ARG0_D1]], %[[ARG0_D2]], %[[ARG1]], %[[ARG1_D0]])
+  // CHECK-SAME: (tensor<4x?x?x2xf32>, index, index, tensor<?xi32>, index) -> (tensor<4x?x?x2xf32>, index, index, %[[ARG1]], index)
+  %0:2 = util.call @tiedCallee(%arg0, %arg1) : (tensor<4x?x?x2xf32>, tensor<?xi32>) -> (tensor<4x?x?x2xf32>, %arg1)
+
+  // CHECK-NEXT: %[[TIED_RET0:.+]] = flow.tensor.tie_shape %[[RET]]#0 : tensor<4x?x?x2xf32>{%[[RET]]#1, %[[RET]]#2}
+  // CHECK-NEXT: %[[TIED_RET1:.+]] = flow.tensor.tie_shape %[[RET]]#3 : tensor<?xi32>{%[[RET]]#4}
+
+  // CHECK-NEXT: util.optimization_barrier %[[TIED_RET0]]
+  util.optimization_barrier %0#0 : tensor<4x?x?x2xf32>
+  // CHECK-NEXT: util.optimization_barrier %[[TIED_RET1]]
+  util.optimization_barrier %0#1 : tensor<?xi32>
+
+  util.return
+}
+
+// CHECK-LABEL: util.func private @tiedCallee
+// CHECK-SAME: (%[[CALLEE_ARG0:.+]]: tensor<4x?x?x2xf32>, %[[CALLEE_ARG0_D1:.+]]: index, %[[CALLEE_ARG0_D2:.+]]: index,
+// CHECK-SAME:  %[[CALLEE_ARG1:.+]]: tensor<?xi32>, %[[CALLEE_ARG1_D0:.+]]: index)
+// CHECK-SAME: -> (tensor<4x?x?x2xf32>, index, index, %[[CALLEE_ARG1]], index)
+util.func private @tiedCallee(%arg0: tensor<4x?x?x2xf32>, %arg1: tensor<?xi32>) -> (tensor<4x?x?x2xf32>, %arg1) {
+  // CHECK-NEXT: %[[CALLEE_TIED_ARG0:.+]] = flow.tensor.tie_shape %[[CALLEE_ARG0]] : tensor<4x?x?x2xf32>{%[[CALLEE_ARG0_D1]], %[[CALLEE_ARG0_D2]]}
+  // CHECK-NEXT: %[[CALLEE_TIED_ARG1:.+]] = flow.tensor.tie_shape %[[CALLEE_ARG1]] : tensor<?xi32>{%[[CALLEE_ARG1_D0]]}
+
+  // CHECK-NEXT: util.optimization_barrier %[[CALLEE_TIED_ARG0]]
+  util.optimization_barrier %arg0 : tensor<4x?x?x2xf32>
+  // CHECK-NEXT: util.optimization_barrier %[[CALLEE_TIED_ARG1]]
+  util.optimization_barrier %arg1 : tensor<?xi32>
+
+  // CHECK-NEXT: util.return %[[CALLEE_ARG0]], %[[CALLEE_ARG0_D1]], %[[CALLEE_ARG0_D2]], %[[CALLEE_ARG1]], %[[CALLEE_ARG1_D0]]
+  util.return %arg0, %arg1 : tensor<4x?x?x2xf32>, tensor<?xi32>
+}
 
 // -----
 
@@ -110,7 +155,7 @@
 // CHECK-LABEL: @br
 // CHECK-SAME: (%[[ARG0:.+]]: tensor<4x?x?x2xf32>, %[[ARG0_D1:.+]]: index, %[[ARG0_D2:.+]]: index,
 // CHECK-SAME:  %[[ARG1:.+]]: tensor<?xi32>, %[[ARG1_D0:.+]]: index)
-func.func @br(%arg0: tensor<4x?x?x2xf32>, %arg1: tensor<?xi32>) {
+util.func private @br(%arg0: tensor<4x?x?x2xf32>, %arg1: tensor<?xi32>) {
   // CHECK-NEXT: %[[TIED_ARG0:.+]] = flow.tensor.tie_shape %[[ARG0]] : tensor<4x?x?x2xf32>{%[[ARG0_D1]], %[[ARG0_D2]]}
   // CHECK-NEXT: %[[TIED_ARG1:.+]] = flow.tensor.tie_shape %[[ARG1]] : tensor<?xi32>{%[[ARG1_D0]]}
 
@@ -128,7 +173,7 @@
   // CHECK-NEXT: util.optimization_barrier %[[TIED_BB1_ARG1]]
   util.optimization_barrier %bb1_arg1 : tensor<?xi32>
 
-  return
+  util.return
 }
 
 // -----
@@ -138,7 +183,7 @@
 // CHECK-LABEL: @select
 // CHECK-SAME: (%[[COND:.+]]: i1,
 // CHECK-SAME:  %[[ARG0:.+]]: tensor<4x?x?x2xf32>, %[[ARG0_D1:.+]]: index, %[[ARG0_D2:.+]]: index, %[[ARG1:.+]]: tensor<4x?x?x2xf32>, %[[ARG1_D1:.+]]: index, %[[ARG1_D2:.+]]: index)
-func.func @select(%cond: i1, %arg0: tensor<4x?x?x2xf32>, %arg1: tensor<4x?x?x2xf32>) {
+util.func private @select(%cond: i1, %arg0: tensor<4x?x?x2xf32>, %arg1: tensor<4x?x?x2xf32>) {
   // CHECK-NEXT: %[[TIED_ARG0:.+]] = flow.tensor.tie_shape %[[ARG0]] : tensor<4x?x?x2xf32>{%[[ARG0_D1]], %[[ARG0_D2]]}
   // CHECK-NEXT: %[[TIED_ARG1:.+]] = flow.tensor.tie_shape %[[ARG1]] : tensor<4x?x?x2xf32>{%[[ARG1_D1]], %[[ARG1_D2]]}
 
@@ -151,14 +196,14 @@
   // CHECK-NEXT: util.optimization_barrier %[[SEL_TIED]]
   util.optimization_barrier %0 : tensor<4x?x?x2xf32>
 
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @scf_while
 // CHECK-SAME: %[[ARG0:.+]]: tensor<?xf32>, %[[ARG1:.+]]: index, %[[ARG2:.+]]: i32
-func.func @scf_while(%arg0 : tensor<?xf32>, %arg1 : i32) {
+util.func private @scf_while(%arg0 : tensor<?xf32>, %arg1 : i32) {
   %zero = arith.constant 0 : i32
   %one = arith.constant 1 : i32
   // CHECK:  %[[TIE:.+]] = flow.tensor.tie_shape %[[ARG0]] : tensor<?xf32>{%[[ARG1]]}
@@ -188,14 +233,14 @@
   // CHECK:  %[[TIE:.+]] = flow.tensor.tie_shape %[[WHILE]]#1 : tensor<?xf32>{%[[WHILE]]#2}
   // CHECK:  %[[BARRIER:.+]] = util.optimization_barrier %[[TIE]]
   util.optimization_barrier %0#1 : tensor<?xf32>
-  return
+  util.return
 }
 
 // -----
 
-// CHECK-LABEL: func.func @scf_if
+// CHECK-LABEL: @scf_if
 // CHECK-SAME: %[[ARG0:.+]]: tensor<?xf32>, %[[ARG1:.+]]: index, %[[ARG2:.+]]: i1
-func.func @scf_if(%arg0 : tensor<?xf32>, %arg1 : i1) {
+util.func private @scf_if(%arg0 : tensor<?xf32>, %arg1 : i1) {
   // CHECK:   %[[TIE:.+]] = flow.tensor.tie_shape %[[ARG0]] : tensor<?xf32>{%[[ARG1]]}
   // CHECK:   %[[IF:.+]]:2 = scf.if %[[ARG2]] -> (tensor<?xf32>, index) {
   %0 = scf.if %arg1 -> tensor<?xf32> {
@@ -213,5 +258,5 @@
   // CHECK:   %[[TIE:.+]] = flow.tensor.tie_shape %[[IF]]#0 : tensor<?xf32>{%[[IF]]#1}
   // CHECK:   %[[BARRIER:.+]] = util.optimization_barrier %[[TIE]] : tensor<?xf32>
   util.optimization_barrier %0 : tensor<?xf32>
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/GlobalOptimization/test/flow_hoist_into_globals.mlir b/compiler/src/iree/compiler/GlobalOptimization/test/flow_hoist_into_globals.mlir
index 332de0a..8cf38af 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/test/flow_hoist_into_globals.mlir
+++ b/compiler/src/iree/compiler/GlobalOptimization/test/flow_hoist_into_globals.mlir

@@ -2,18 +2,18 @@
 
 // CHECK-LABEL: @hoist_sub_byte_tensor_store
 module @hoist_sub_byte_tensor_store {
-  func.func @main() -> (tensor<64xi4>) {
+  util.func public @main() -> (tensor<64xi4>) {
     %0 = arith.constant dense<3> : tensor<64xi32>
     %2 = "iree_unregistered.const_expr"(%0) : (tensor<64xi32>) -> tensor<64xi4>
-    return %2 : tensor<64xi4>
+    util.return %2 : tensor<64xi4>
   }
 }
 
 // CHECK: util.global private @{{.*}} : tensor<32xi8>
-// CHECK: func.func @main() -> tensor<64xi4>
+// CHECK: util.func public @main() -> tensor<64xi4>
 // CHECK:   %[[GLOBAL_LD:.+]] = util.global.load @{{.*}} : tensor<32xi8>
 // CHECK:   %[[ORIG_VAL:.+]] = flow.tensor.bitcast %[[GLOBAL_LD]] : tensor<32xi8> -> tensor<64xi4>
-// CHECK:   return %[[ORIG_VAL]]
+// CHECK:   util.return %[[ORIG_VAL]]
 
 // CHECK: util.initializer attributes {iree.compiler.consteval}
 // CHECK:   %[[CEXPR:.+]] = "iree_unregistered.const_expr"
@@ -30,21 +30,21 @@
   // CHECK: util.global private @latent_global : tensor<8xi4>
   util.global private @latent_global : tensor<8xi4>
 
-  // CHECK: func.func @main
-  func.func @main() -> (tensor<8xi4>, tensor<8xi4>, tensor<8xi4>) {
+  // CHECK: util.func public @main
+  util.func public @main() -> (tensor<8xi4>, tensor<8xi4>, tensor<8xi4>) {
     // CHECK-DAG: %[[LOAD_HOISTED_0:.*]] = util.global.load @[[HOISTED_0]] : tensor<4xi8>
     // CHECK-DAG: %[[BITCAST_0:.*]] = flow.tensor.bitcast %[[LOAD_HOISTED_0]] : tensor<4xi8> -> tensor<8xi4>
     // CHECK-DAG: %[[LOAD_HOISTED_1:.*]] = util.global.load @[[HOISTED_1]] : tensor<4xi8>
     // CHECK-DAG: %[[BITCAST_1:.*]] = flow.tensor.bitcast %[[LOAD_HOISTED_1]] : tensor<4xi8> -> tensor<8xi4>
     // CHECK-DAG: %[[RESULT:.*]] = "iree_unregistered.var_expr"(%[[BITCAST_1]])
-    // CHECK: return %[[BITCAST_0]], %[[BITCAST_1]], %[[RESULT]]
+    // CHECK: util.return %[[BITCAST_0]], %[[BITCAST_1]], %[[RESULT]]
     %0 = arith.constant dense<0> : tensor<8xi4>
     %1 = arith.constant dense<1> : tensor<8xi4>
     %2 = "iree_unregistered.const_expr"(%0, %1) : (tensor<8xi4>, tensor<8xi4>) -> tensor<8xi4>
     %3 = util.global.load @latent_global : tensor<8xi4>
     %4 = "iree_unregistered.const_expr"(%2, %3) : (tensor<8xi4>, tensor<8xi4>) -> tensor<8xi4>
     %5 = "iree_unregistered.var_expr"(%4) : (tensor<8xi4>) -> tensor<8xi4>
-    return %2, %4, %5 : tensor<8xi4>, tensor<8xi4>, tensor<8xi4>
+    util.return %2, %4, %5 : tensor<8xi4>, tensor<8xi4>, tensor<8xi4>
   }
   // CHECK: util.initializer attributes {iree.compiler.consteval} {
   // CHECK:   %[[C0:.*]] = arith.constant dense<0> : tensor<8xi4>
@@ -70,10 +70,10 @@
 // CHECK-LABEL: @hoist_sub_byte_tensor_transitive
 // CHECK: util.global
 module @hoist_sub_byte_tensor_transitive {
-  func.func @main() -> (i32) {
+  util.func public @main() -> (i32) {
     %0 = arith.constant dense<3> : tensor<i4>
     %2 = "iree_unregistered.const_expr"(%0) : (tensor<i4>) -> i32
-    return %2 : i32
+    util.return %2 : i32
   }
 }
 // We do not need to cast for transitive sub-byte values.
@@ -85,9 +85,9 @@
 // CHECK-LABEL: @do_not_hoist_metadata_leaf
 // CHECK-NOT: util.global
 module @do_not_hoist_metadata_leaf {
-  func.func @main() -> (tensor<1xi32>) {
+  util.func public @main() -> (tensor<1xi32>) {
     %0 = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi8>
     %1 = flow.tensor.bitcast %0 : tensor<4xi8> -> tensor<1xi32>
-    return %1 : tensor<1xi32>
+    util.return %1 : tensor<1xi32>
   }
 }

diff --git a/compiler/src/iree/compiler/GlobalOptimization/test/fuse_dequantization_matmul.mlir b/compiler/src/iree/compiler/GlobalOptimization/test/fuse_dequantization_matmul.mlir
index 2b59689..d276a46 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/test/fuse_dequantization_matmul.mlir
+++ b/compiler/src/iree/compiler/GlobalOptimization/test/fuse_dequantization_matmul.mlir

@@ -1,38 +1,36 @@
-// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-global-opt-fuse-dequantization-matmul{enable-quantized-matmul-reassociation=true},canonicalize))" %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(util.func(iree-global-opt-fuse-dequantization-matmul{enable-quantized-matmul-reassociation=true},canonicalize))" %s | FileCheck %s
 
-module {
-  func.func @grouped_quantized_matmul_reassociate(%arg0: tensor<11008x32x128xi4>, %arg1: tensor<32x128xf32>, %arg2: tensor<11008x32xf32>, %arg3: tensor<11008x32xf32>) -> tensor<11008xf32> {
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = tensor.empty() : tensor<11008xf32>
-    %1 = tensor.empty() : tensor<11008x32x128xf32>
-    %2 = linalg.fill ins(%cst : f32) outs(%0 : tensor<11008xf32>) -> tensor<11008xf32>
-    %3 = linalg.generic {
-        indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
-                         affine_map<(d0, d1, d2) -> (d0, d1)>,
-                         affine_map<(d0, d1, d2) -> (d0, d1)>,
-                         affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
-        iterator_types = ["parallel", "parallel", "parallel"]}
-        ins(%arg0, %arg2, %arg3 : tensor<11008x32x128xi4>, tensor<11008x32xf32>, tensor<11008x32xf32>) outs(%1 : tensor<11008x32x128xf32>) {
-    ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32):
-      %5 = arith.extui %in : i4 to i32
-      %6 = arith.uitofp %5 : i32 to f32
-      %7 = arith.subf %6, %in_1 : f32
-      %8 = arith.mulf %7, %in_0 : f32
-      linalg.yield %8 : f32
-    } -> tensor<11008x32x128xf32>
-    %4 = linalg.generic {
-        indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>,
-                         affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
-                         affine_map<(d0, d1, d2) -> (d0)>],
-        iterator_types = ["parallel", "reduction", "reduction"]}
-        ins(%arg1, %3 : tensor<32x128xf32>, tensor<11008x32x128xf32>) outs(%2 : tensor<11008xf32>) {
-    ^bb0(%in: f32, %in_0: f32, %out: f32):
-      %5 = arith.mulf %in, %in_0 : f32
-      %6 = arith.addf %5, %out : f32
-      linalg.yield %6 : f32
-    } -> tensor<11008xf32>
-    return %4 : tensor<11008xf32>
-  }
+util.func public @grouped_quantized_matmul_reassociate(%arg0: tensor<11008x32x128xi4>, %arg1: tensor<32x128xf32>, %arg2: tensor<11008x32xf32>, %arg3: tensor<11008x32xf32>) -> tensor<11008xf32> {
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = tensor.empty() : tensor<11008xf32>
+  %1 = tensor.empty() : tensor<11008x32x128xf32>
+  %2 = linalg.fill ins(%cst : f32) outs(%0 : tensor<11008xf32>) -> tensor<11008xf32>
+  %3 = linalg.generic {
+      indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
+                        affine_map<(d0, d1, d2) -> (d0, d1)>,
+                        affine_map<(d0, d1, d2) -> (d0, d1)>,
+                        affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
+      iterator_types = ["parallel", "parallel", "parallel"]}
+      ins(%arg0, %arg2, %arg3 : tensor<11008x32x128xi4>, tensor<11008x32xf32>, tensor<11008x32xf32>) outs(%1 : tensor<11008x32x128xf32>) {
+  ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32):
+    %5 = arith.extui %in : i4 to i32
+    %6 = arith.uitofp %5 : i32 to f32
+    %7 = arith.subf %6, %in_1 : f32
+    %8 = arith.mulf %7, %in_0 : f32
+    linalg.yield %8 : f32
+  } -> tensor<11008x32x128xf32>
+  %4 = linalg.generic {
+      indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>,
+                        affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
+                        affine_map<(d0, d1, d2) -> (d0)>],
+      iterator_types = ["parallel", "reduction", "reduction"]}
+      ins(%arg1, %3 : tensor<32x128xf32>, tensor<11008x32x128xf32>) outs(%2 : tensor<11008xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %5 = arith.mulf %in, %in_0 : f32
+    %6 = arith.addf %5, %out : f32
+    linalg.yield %6 : f32
+  } -> tensor<11008xf32>
+  util.return %4 : tensor<11008xf32>
 }
 //   CHECK-DAG: #[[MAP0:[a-zA-Z0-9]+]] = affine_map<(d0, d1) -> (d0, d1)>
 //   CHECK-DAG: #[[MAP1:[a-zA-Z0-9]+]] = affine_map<(d0, d1) -> (d0)>
@@ -41,7 +39,7 @@
 //   CHECK-DAG: #[[MAP4:[a-zA-Z0-9]+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 //   CHECK-DAG: #[[MAP5:[a-zA-Z0-9]+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
 //   CHECK-DAG: #[[MAP6:[a-zA-Z0-9]+]] = affine_map<(d0, d1) -> (d1)>
-//       CHECK: func.func @grouped_quantized_matmul_reassociate(
+//       CHECK: util.func public @grouped_quantized_matmul_reassociate(
 //  CHECK-SAME:   %[[QUANT:[a-zA-Z0-9_]+]]: tensor<11008x32x128xi4>
 //  CHECK-SAME:   %[[UNQUANT:[a-zA-Z0-9_]+]]: tensor<32x128xf32>
 //  CHECK-SAME:   %[[SCALES:[a-zA-Z0-9_]+]]: tensor<11008x32xf32>
@@ -122,43 +120,41 @@
 //       CHECK:   %[[RESUBF:.+]] = arith.subf %[[REMULF1]], %[[REMULF3]] : f32
 //       CHECK:   %[[READDF:.+]] = arith.addf %[[RESUBF]], %[[REOUT0]] : f32
 //       CHECK:   linalg.yield %[[READDF]] : f32
-//       CHECK:   return %[[GENREASSOCIATE]]
+//       CHECK:   util.return %[[GENREASSOCIATE]]
 
 // -----
 
-module {
-  func.func @grouped_quantized_matmul_reassociate_f16(%arg0: tensor<11008x32x128xi4>, %arg1: tensor<32x128xf16>, %arg2: tensor<11008x32xf16>, %arg3: tensor<11008x32xf16>) -> tensor<11008xf16> {
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty() : tensor<11008xf16>
-    %1 = tensor.empty() : tensor<11008x32x128xf16>
-    %2 = linalg.fill ins(%cst : f16) outs(%0 : tensor<11008xf16>) -> tensor<11008xf16>
-    %3 = linalg.generic {
-        indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
-                         affine_map<(d0, d1, d2) -> (d0, d1)>,
-                         affine_map<(d0, d1, d2) -> (d0, d1)>,
-                         affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
-        iterator_types = ["parallel", "parallel", "parallel"]}
-        ins(%arg0, %arg2, %arg3 : tensor<11008x32x128xi4>, tensor<11008x32xf16>, tensor<11008x32xf16>) outs(%1 : tensor<11008x32x128xf16>) {
-    ^bb0(%in: i4, %in_0: f16, %in_1: f16, %out: f16):
-      %5 = arith.extui %in : i4 to i32
-      %6 = arith.uitofp %5 : i32 to f16
-      %7 = arith.subf %6, %in_1 : f16
-      %8 = arith.mulf %7, %in_0 : f16
-      linalg.yield %8 : f16
-    } -> tensor<11008x32x128xf16>
-    %4 = linalg.generic {
-        indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>,
-                         affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
-                         affine_map<(d0, d1, d2) -> (d0)>],
-        iterator_types = ["parallel", "reduction", "reduction"]}
-        ins(%arg1, %3 : tensor<32x128xf16>, tensor<11008x32x128xf16>) outs(%2 : tensor<11008xf16>) {
-    ^bb0(%in: f16, %in_0: f16, %out: f16):
-      %5 = arith.mulf %in, %in_0 : f16
-      %6 = arith.addf %5, %out : f16
-      linalg.yield %6 : f16
-    } -> tensor<11008xf16>
-    return %4 : tensor<11008xf16>
-  }
+util.func public @grouped_quantized_matmul_reassociate_f16(%arg0: tensor<11008x32x128xi4>, %arg1: tensor<32x128xf16>, %arg2: tensor<11008x32xf16>, %arg3: tensor<11008x32xf16>) -> tensor<11008xf16> {
+  %cst = arith.constant 0.000000e+00 : f16
+  %0 = tensor.empty() : tensor<11008xf16>
+  %1 = tensor.empty() : tensor<11008x32x128xf16>
+  %2 = linalg.fill ins(%cst : f16) outs(%0 : tensor<11008xf16>) -> tensor<11008xf16>
+  %3 = linalg.generic {
+      indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
+                        affine_map<(d0, d1, d2) -> (d0, d1)>,
+                        affine_map<(d0, d1, d2) -> (d0, d1)>,
+                        affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
+      iterator_types = ["parallel", "parallel", "parallel"]}
+      ins(%arg0, %arg2, %arg3 : tensor<11008x32x128xi4>, tensor<11008x32xf16>, tensor<11008x32xf16>) outs(%1 : tensor<11008x32x128xf16>) {
+  ^bb0(%in: i4, %in_0: f16, %in_1: f16, %out: f16):
+    %5 = arith.extui %in : i4 to i32
+    %6 = arith.uitofp %5 : i32 to f16
+    %7 = arith.subf %6, %in_1 : f16
+    %8 = arith.mulf %7, %in_0 : f16
+    linalg.yield %8 : f16
+  } -> tensor<11008x32x128xf16>
+  %4 = linalg.generic {
+      indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>,
+                        affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
+                        affine_map<(d0, d1, d2) -> (d0)>],
+      iterator_types = ["parallel", "reduction", "reduction"]}
+      ins(%arg1, %3 : tensor<32x128xf16>, tensor<11008x32x128xf16>) outs(%2 : tensor<11008xf16>) {
+  ^bb0(%in: f16, %in_0: f16, %out: f16):
+    %5 = arith.mulf %in, %in_0 : f16
+    %6 = arith.addf %5, %out : f16
+    linalg.yield %6 : f16
+  } -> tensor<11008xf16>
+  util.return %4 : tensor<11008xf16>
 }
 
 //   CHECK-DAG: #[[MAP0:[a-zA-Z0-9]+]] = affine_map<(d0, d1) -> (d0, d1)>
@@ -168,7 +164,7 @@
 //   CHECK-DAG: #[[MAP4:[a-zA-Z0-9]+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 //   CHECK-DAG: #[[MAP5:[a-zA-Z0-9]+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
 //   CHECK-DAG: #[[MAP6:[a-zA-Z0-9]+]] = affine_map<(d0, d1) -> (d1)>
-//       CHECK: func.func @grouped_quantized_matmul_reassociate_f16(
+//       CHECK: util.func public @grouped_quantized_matmul_reassociate_f16(
 //  CHECK-SAME:   %[[QUANT:[a-zA-Z0-9_]+]]: tensor<11008x32x128xi4>
 //  CHECK-SAME:   %[[UNQUANT:[a-zA-Z0-9_]+]]: tensor<32x128xf16>
 //  CHECK-SAME:   %[[SCALES:[a-zA-Z0-9_]+]]: tensor<11008x32xf16>
@@ -249,4 +245,4 @@
 //       CHECK:   %[[RESUBF:.+]] = arith.subf %[[REMULF1]], %[[REMULF3]] : f16
 //       CHECK:   %[[READDF:.+]] = arith.addf %[[RESUBF]], %[[REOUT0]] : f16
 //       CHECK:   linalg.yield %[[READDF]] : f16
-//       CHECK:   return %[[GENREASSOCIATE]]
+//       CHECK:   util.return %[[GENREASSOCIATE]]

diff --git a/compiler/src/iree/compiler/GlobalOptimization/test/fuse_silu_horizontal_matmul.mlir b/compiler/src/iree/compiler/GlobalOptimization/test/fuse_silu_horizontal_matmul.mlir
index 9aecd84..7afea1d 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/test/fuse_silu_horizontal_matmul.mlir
+++ b/compiler/src/iree/compiler/GlobalOptimization/test/fuse_silu_horizontal_matmul.mlir

@@ -1,40 +1,38 @@
-// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-global-opt-fuse-silu-horizontal-matmul,canonicalize))" %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(util.func(iree-global-opt-fuse-silu-horizontal-matmul,canonicalize))" %s | FileCheck %s
 
 #map = affine_map<(d0, d1) -> (d0, d1)>
-module {
-  func.func @silu_horizontal_matmul_fusion(%arg0: index, %arg1: tensor<?x5120xf16>, %arg2: tensor<13824x5120xf16>, %arg3: tensor<13824x5120xf16>) -> tensor<?x13824xf16> {
-    %cst = arith.constant 1.000000e+00 : f16
-    %cst_0 = arith.constant 0.000000e+00 : f16
-    %0 = tensor.empty(%arg0) : tensor<?x13824xf16>
-    %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<?x13824xf16>) -> tensor<?x13824xf16>
-    %2 = linalg.matmul_transpose_b ins(%arg1, %arg2 : tensor<?x5120xf16>, tensor<13824x5120xf16>) outs(%1 : tensor<?x13824xf16>) -> tensor<?x13824xf16>
-    %3 = tensor.empty(%arg0) : tensor<?x13824xf16>
-    %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<?x13824xf16>) outs(%3 : tensor<?x13824xf16>) {
-    ^bb0(%in: f16, %out: f16):
-      %10 = arith.negf %in : f16
-      %11 = math.exp %10 : f16
-      %12 = arith.addf %11, %cst_0 : f16
-      %13 = arith.divf %cst_0, %12 : f16
-      linalg.yield %13 : f16
-    } -> tensor<?x13824xf16>
-    %5 = tensor.empty(%arg0) : tensor<?x13824xf16>
-    %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%4, %2 : tensor<?x13824xf16>, tensor<?x13824xf16>) outs(%5 : tensor<?x13824xf16>) {
-    ^bb0(%in: f16, %in_1: f16, %out: f16):
-      %10 = arith.mulf %in, %in_1 : f16
-      linalg.yield %10 : f16
-    } -> tensor<?x13824xf16>
-    %7 = linalg.matmul_transpose_b ins(%arg1, %arg3 : tensor<?x5120xf16>, tensor<13824x5120xf16>) outs(%1 : tensor<?x13824xf16>) -> tensor<?x13824xf16>
-    %8 = tensor.empty(%arg0) : tensor<?x13824xf16>
-    %9 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%6, %7 : tensor<?x13824xf16>, tensor<?x13824xf16>) outs(%8 : tensor<?x13824xf16>) {
-    ^bb0(%in: f16, %in_1: f16, %out: f16):
-      %10 = arith.mulf %in, %in_1 : f16
-      linalg.yield %10 : f16
-    } -> tensor<?x13824xf16>
-    return %9 : tensor<?x13824xf16>
-  }
+util.func public @silu_horizontal_matmul_fusion(%arg0: index, %arg1: tensor<?x5120xf16>, %arg2: tensor<13824x5120xf16>, %arg3: tensor<13824x5120xf16>) -> tensor<?x13824xf16> {
+  %cst = arith.constant 1.000000e+00 : f16
+  %cst_0 = arith.constant 0.000000e+00 : f16
+  %0 = tensor.empty(%arg0) : tensor<?x13824xf16>
+  %1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<?x13824xf16>) -> tensor<?x13824xf16>
+  %2 = linalg.matmul_transpose_b ins(%arg1, %arg2 : tensor<?x5120xf16>, tensor<13824x5120xf16>) outs(%1 : tensor<?x13824xf16>) -> tensor<?x13824xf16>
+  %3 = tensor.empty(%arg0) : tensor<?x13824xf16>
+  %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<?x13824xf16>) outs(%3 : tensor<?x13824xf16>) {
+  ^bb0(%in: f16, %out: f16):
+    %10 = arith.negf %in : f16
+    %11 = math.exp %10 : f16
+    %12 = arith.addf %11, %cst_0 : f16
+    %13 = arith.divf %cst_0, %12 : f16
+    linalg.yield %13 : f16
+  } -> tensor<?x13824xf16>
+  %5 = tensor.empty(%arg0) : tensor<?x13824xf16>
+  %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%4, %2 : tensor<?x13824xf16>, tensor<?x13824xf16>) outs(%5 : tensor<?x13824xf16>) {
+  ^bb0(%in: f16, %in_1: f16, %out: f16):
+    %10 = arith.mulf %in, %in_1 : f16
+    linalg.yield %10 : f16
+  } -> tensor<?x13824xf16>
+  %7 = linalg.matmul_transpose_b ins(%arg1, %arg3 : tensor<?x5120xf16>, tensor<13824x5120xf16>) outs(%1 : tensor<?x13824xf16>) -> tensor<?x13824xf16>
+  %8 = tensor.empty(%arg0) : tensor<?x13824xf16>
+  %9 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%6, %7 : tensor<?x13824xf16>, tensor<?x13824xf16>) outs(%8 : tensor<?x13824xf16>) {
+  ^bb0(%in: f16, %in_1: f16, %out: f16):
+    %10 = arith.mulf %in, %in_1 : f16
+    linalg.yield %10 : f16
+  } -> tensor<?x13824xf16>
+  util.return %9 : tensor<?x13824xf16>
 }
 //   CHECK-DAG: #[[MAP:[a-zA-Z0-9]+]] = affine_map<(d0, d1) -> (d0, d1)>
-//       CHECK: func.func @silu_horizontal_matmul_fusion(
+//       CHECK: util.func public @silu_horizontal_matmul_fusion(
 //  CHECK-SAME:   %[[IN0:.+]]: index,
 //  CHECK-SAME:   %[[IN1:.+]]: tensor<?x5120xf16>,
 //  CHECK-SAME:   %[[IN2:.+]]: tensor<13824x5120xf16>,
@@ -70,4 +68,4 @@
 //       CHECK:   } -> tensor<?x13824xf16>
 //       CHECK:   flow.return %[[OUTPUT]] : tensor<?x13824xf16>
 //       CHECK: }
-//       CHECK: return %[[DISPATCH]] : tensor<?x13824xf16>
+//       CHECK: util.return %[[DISPATCH]] : tensor<?x13824xf16>

diff --git a/compiler/src/iree/compiler/GlobalOptimization/test/generalize_named_ops.mlir b/compiler/src/iree/compiler/GlobalOptimization/test/generalize_named_ops.mlir
index 2f97a65..b3413b3 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/test/generalize_named_ops.mlir
+++ b/compiler/src/iree/compiler/GlobalOptimization/test/generalize_named_ops.mlir

@@ -1,6 +1,6 @@
-// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-global-opt-generalize-linalg-named-ops))" --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline="builtin.module(util.func(iree-global-opt-generalize-linalg-named-ops))" --split-input-file %s | FileCheck %s
 
-func.func @generalize_op(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>) -> tensor<?x?xf32> {
+util.func public @generalize_op(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>) -> tensor<?x?xf32> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %d0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
@@ -8,15 +8,15 @@
   %empty = tensor.empty(%d0, %d1): tensor<?x?xf32>
   %add = linalg.add ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
       outs(%empty : tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %add : tensor<?x?xf32>
+  util.return %add : tensor<?x?xf32>
 }
-// CHECK-LABEL: func @generalize_op
+// CHECK-LABEL: util.func public @generalize_op
 //       CHECK:   %[[GENERIC:.+]] = linalg.generic
-//       CHECK:   return %[[GENERIC]]
+//       CHECK:   util.return %[[GENERIC]]
 
 // -----
 
-func.func @no_generalize_op_within_dispatch(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>) -> tensor<?x?xf32> {
+util.func public @no_generalize_op_within_dispatch(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>) -> tensor<?x?xf32> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %d0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
@@ -27,11 +27,11 @@
         outs(%empty : tensor<?x?xf32>) -> tensor<?x?xf32>
     flow.return %add : tensor<?x?xf32>
   }
-  return %dispatch : tensor<?x?xf32>
+  util.return %dispatch : tensor<?x?xf32>
 }
-// CHECK-LABEL: func @no_generalize_op_within_dispatch
+// CHECK-LABEL: util.func public @no_generalize_op_within_dispatch
 //       CHECK:   %[[DISPATCH:.+]] = flow.dispatch.region
 //       CHECK:     %[[ADD:.+]] = linalg.add
 //       CHECK:     flow.return %[[ADD]]
-//       CHECK:   return %[[DISPATCH]]
+//       CHECK:   util.return %[[DISPATCH]]
 

diff --git a/compiler/src/iree/compiler/GlobalOptimization/test/hoist_into_globals_linalg.mlir b/compiler/src/iree/compiler/GlobalOptimization/test/hoist_into_globals_linalg.mlir
index c934b30..5724d48 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/test/hoist_into_globals_linalg.mlir
+++ b/compiler/src/iree/compiler/GlobalOptimization/test/hoist_into_globals_linalg.mlir

@@ -6,8 +6,8 @@
 #map1 = affine_map<(d0, d1) -> (d0, d1)>
 module @compute_hoisted {
   // CHECK: util.global private @[[HOISTED:.*]] : tensor<5x6xf32>
-  // CHECK: func.func @main
-  func.func @main() -> (tensor<5x6xf32>) {
+  // CHECK: util.func public @main
+  util.func public @main() -> (tensor<5x6xf32>) {
     %cst_0 = arith.constant dense<1.270000e+02> : tensor<f32>
 
     // A non-leaf broadcast.
@@ -26,8 +26,8 @@
     } -> tensor<5x6xf32>
 
     // CHECK: %[[RESULT:.*]] = util.global.load @[[HOISTED]] : tensor<5x6xf32>
-    // CHECK: return %[[RESULT]]
-    return %3 : tensor<5x6xf32>
+    // CHECK: util.return %[[RESULT]]
+    util.return %3 : tensor<5x6xf32>
   }
   // CHECK: util.initializer
 }
@@ -41,8 +41,8 @@
 #map1 = affine_map<(d0, d1) -> (d0, d1)>
 module @broadcast_treated_as_leaf {
   // CHECK-NOT: util.global
-  // CHECK: func.func @main
-  func.func @main() -> (tensor<5x6xf32>) {
+  // CHECK: util.func public @main
+  util.func public @main() -> (tensor<5x6xf32>) {
     %cst_0 = arith.constant dense<1.270000e+02> : tensor<f32>
     // CHECK: tensor.empty()
     %0 = tensor.empty() : tensor<5x6xf32>
@@ -52,8 +52,8 @@
     ^bb0(%arg1: f32, %arg2: f32):  // no predecessors
       linalg.yield %arg1 : f32
     } -> tensor<5x6xf32>
-    // CHECK: return
-    return %1 : tensor<5x6xf32>
+    // CHECK: util.return
+    util.return %1 : tensor<5x6xf32>
   }
   // CHECK-NOT: util.initializer
 }

diff --git a/compiler/src/iree/compiler/GlobalOptimization/test/infer_numeric_narrowing.mlir b/compiler/src/iree/compiler/GlobalOptimization/test/infer_numeric_narrowing.mlir
index 025a981..3728f74 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/test/infer_numeric_narrowing.mlir
+++ b/compiler/src/iree/compiler/GlobalOptimization/test/infer_numeric_narrowing.mlir

@@ -7,7 +7,7 @@
 // Checks as a by-product:
 //   - Infering ui0 for [0, 0] range
 //   - Infering unsigned for >= 0 range
-func.func @probe_linalg_op(%arg0 : tensor<5x3xf32>) -> tensor<5x1xf32> {
+util.func public @probe_linalg_op(%arg0 : tensor<5x3xf32>) -> tensor<5x1xf32> {
   // CHECK-DAG: %[[RHS:.*]] = arith.constant dense
   // CHECK-DAG: %[[ZERO:.*]] = arith.constant 0.000000e+00 : f32
   // CHECK-DAG: util.numeric.optional_narrow %[[ZERO]] : f32 as ui0
@@ -20,56 +20,56 @@
   %0 = tensor.empty() : tensor<5x1xf32>
   %1 = linalg.fill ins(%init_value : f32) outs(%0 : tensor<5x1xf32>) -> tensor<5x1xf32>
   %2 = linalg.matmul ins(%arg0, %rhs : tensor<5x3xf32>, tensor<3x1xf32>) outs(%1 : tensor<5x1xf32>) -> tensor<5x1xf32>
-  return %2 : tensor<5x1xf32>
+  util.return %2 : tensor<5x1xf32>
 }
 
 // CHECK-LABEL: @infer_symmetric_signed
 // CHECK: util.numeric.optional_narrow %{{.*}} : tensor<3x1xf32> as si8 {max_value = 127 : si8, min_value = -39 : si8}
-func.func @infer_symmetric_signed(%arg0 : tensor<5x3xf32>) -> tensor<5x1xf32> {
+util.func public @infer_symmetric_signed(%arg0 : tensor<5x3xf32>) -> tensor<5x1xf32> {
   %rhs = arith.constant dense<
     [[-3.900000e+01], [0.000000e+00], [1.270000e+02]]> : tensor<3x1xf32>
   %init_value = arith.constant 0.000000e+00 : f32
   %0 = tensor.empty() : tensor<5x1xf32>
   %1 = linalg.fill ins(%init_value : f32) outs(%0 : tensor<5x1xf32>) -> tensor<5x1xf32>
   %2 = linalg.matmul ins(%arg0, %rhs : tensor<5x3xf32>, tensor<3x1xf32>) outs(%1 : tensor<5x1xf32>) -> tensor<5x1xf32>
-  return %2 : tensor<5x1xf32>
+  util.return %2 : tensor<5x1xf32>
 }
 
 // CHECK-LABEL: @infer_i1_signed
 // Signed i1 is a silly boundary condition worth checking.
 // CHECK: util.numeric.optional_narrow %{{.*}} : tensor<3x1xf32> as si1 {max_value = 0 : si1, min_value = -1 : si1}
-func.func @infer_i1_signed(%arg0 : tensor<5x3xf32>) -> tensor<5x1xf32> {
+util.func public @infer_i1_signed(%arg0 : tensor<5x3xf32>) -> tensor<5x1xf32> {
   %rhs = arith.constant dense<
     [[0.000000e+00], [0.000000e+00], [-1.000000e+00]]> : tensor<3x1xf32>
   %init_value = arith.constant 0.000000e+00 : f32
   %0 = tensor.empty() : tensor<5x1xf32>
   %1 = linalg.fill ins(%init_value : f32) outs(%0 : tensor<5x1xf32>) -> tensor<5x1xf32>
   %2 = linalg.matmul ins(%arg0, %rhs : tensor<5x3xf32>, tensor<3x1xf32>) outs(%1 : tensor<5x1xf32>) -> tensor<5x1xf32>
-  return %2 : tensor<5x1xf32>
+  util.return %2 : tensor<5x1xf32>
 }
 
 // CHECK-LABEL: @infer_positive_non_straddling_zero
 // A range that does not straddle zero is a special case in the code.
 // CHECK: util.numeric.optional_narrow %{{.*}} : tensor<3x1xf32> as ui2 {max_value = 2 : ui2, min_value = 1 : ui2}
-func.func @infer_positive_non_straddling_zero(%arg0 : tensor<5x3xf32>) -> tensor<5x1xf32> {
+util.func public @infer_positive_non_straddling_zero(%arg0 : tensor<5x3xf32>) -> tensor<5x1xf32> {
   %rhs = arith.constant dense<
     [[1.000000e+00], [1.000000e+00], [2.000000e+00]]> : tensor<3x1xf32>
   %init_value = arith.constant 0.000000e+00 : f32
   %0 = tensor.empty() : tensor<5x1xf32>
   %1 = linalg.fill ins(%init_value : f32) outs(%0 : tensor<5x1xf32>) -> tensor<5x1xf32>
   %2 = linalg.matmul ins(%arg0, %rhs : tensor<5x3xf32>, tensor<3x1xf32>) outs(%1 : tensor<5x1xf32>) -> tensor<5x1xf32>
-  return %2 : tensor<5x1xf32>
+  util.return %2 : tensor<5x1xf32>
 }
 
 // CHECK-LABEL: @infer_negative_non_straddling_zero
 // A range that does not straddle zero is a special case in the code.
 // CHECK: util.numeric.optional_narrow %{{.*}} : tensor<3x1xf32> as si2 {max_value = -1 : si2, min_value = -2 : si2}
-func.func @infer_negative_non_straddling_zero(%arg0 : tensor<5x3xf32>) -> tensor<5x1xf32> {
+util.func public @infer_negative_non_straddling_zero(%arg0 : tensor<5x3xf32>) -> tensor<5x1xf32> {
   %rhs = arith.constant dense<
     [[-1.000000e+00], [-1.000000e+00], [-2.000000e+00]]> : tensor<3x1xf32>
   %init_value = arith.constant 0.000000e+00 : f32
   %0 = tensor.empty() : tensor<5x1xf32>
   %1 = linalg.fill ins(%init_value : f32) outs(%0 : tensor<5x1xf32>) -> tensor<5x1xf32>
   %2 = linalg.matmul ins(%arg0, %rhs : tensor<5x3xf32>, tensor<3x1xf32>) outs(%1 : tensor<5x1xf32>) -> tensor<5x1xf32>
-  return %2 : tensor<5x1xf32>
+  util.return %2 : tensor<5x1xf32>
 }

diff --git a/compiler/src/iree/compiler/GlobalOptimization/test/materialize_homogeneous_encodings.mlir b/compiler/src/iree/compiler/GlobalOptimization/test/materialize_homogeneous_encodings.mlir
index 009a211..61b2572 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/test/materialize_homogeneous_encodings.mlir
+++ b/compiler/src/iree/compiler/GlobalOptimization/test/materialize_homogeneous_encodings.mlir

@@ -7,7 +7,7 @@
 #map3 = affine_map<(d0, d1, d2) -> (d0, d1)>
 #device_target_llvm_cpu = #hal.device.target<"llvm-cpu", {executable_targets = [#executable_target_embedded_elf_x86_64_]}>
 module attributes {hal.device.targets = [#device_target_llvm_cpu]} {
-  func.func @lhs_encoding(%arg0: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  util.func public @lhs_encoding(%arg0: tensor<?x?xf32>) -> tensor<?x?xf32> {
     %cst = arith.constant 0.000000e+00 : f32
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
@@ -22,10 +22,10 @@
     } : tensor<?x?xf32> to tensor<?x?xf32>
     %3 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map1, #map2, #map3]>>
     %4 = iree_linalg_ext.unset_encoding %3 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map1, #map2, #map3]>> -> tensor<?x?xf32>
-    return %4 : tensor<?x?xf32>
+    util.return %4 : tensor<?x?xf32>
   }
 }
-// CHECK-LABEL: func.func @lhs_encoding
+// CHECK-LABEL: util.func public @lhs_encoding
 // CHECK:         tensor.pack
 // CHECK:         tensor.unpack
 
@@ -38,7 +38,7 @@
 #map3 = affine_map<(d0, d1, d2) -> (d0, d1)>
 #device_target_vulkan = #hal.device.target<"vulkan", {executable_targets = [#executable_target_vulkan_spirv_fb], legacy_sync}>
 module attributes {hal.device.targets = [#device_target_vulkan]} {
-  func.func @lhs_encoding(%arg0: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  util.func public @lhs_encoding(%arg0: tensor<?x?xf32>) -> tensor<?x?xf32> {
     %cst = arith.constant 0.000000e+00 : f32
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
@@ -53,14 +53,14 @@
     } : tensor<?x?xf32> to tensor<?x?xf32>
     %3 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map1, #map2, #map3]>>
     %4 = iree_linalg_ext.unset_encoding %3 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map1, #map2, #map3]>> -> tensor<?x?xf32>
-    return %4 : tensor<?x?xf32>
+    util.return %4 : tensor<?x?xf32>
   }
 }
 
 // vulkan uses default materialization patterns which unsets the encodings.
-// CHECK-LABEL: func.func @lhs_encoding
+// CHECK-LABEL: util.func public @lhs_encoding
 // CHECK-SAME:    %[[ARG0:[a-zA-Z0-9]+]]
-// CHECK:         return %[[ARG0]]
+// CHECK:         util.return %[[ARG0]]
 
 // -----
 
@@ -73,7 +73,7 @@
 #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan", "vulkan-spirv-fb">
 #device_target_vulkan = #hal.device.target<"vulkan", {executable_targets = [#executable_target_vulkan_spirv_fb], legacy_sync}>
 module attributes {hal.device.targets = [#device_target_vulkan, #device_target_llvm_cpu]} {
-  func.func @lhs_encoding(%arg0: tensor<?x?xf32>) -> tensor<?x?xf32> {
+  util.func public @lhs_encoding(%arg0: tensor<?x?xf32>) -> tensor<?x?xf32> {
     %cst = arith.constant 0.000000e+00 : f32
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
@@ -88,11 +88,11 @@
     } : tensor<?x?xf32> to tensor<?x?xf32>
     %3 = iree_linalg_ext.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map1, #map2, #map3]>>
     %4 = iree_linalg_ext.unset_encoding %3 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#map1, #map2, #map3]>> -> tensor<?x?xf32>
-    return %4 : tensor<?x?xf32>
+    util.return %4 : tensor<?x?xf32>
   }
 }
 
 // Multiple targets are currently unsupported.
-// CHECK-LABEL: func.func @lhs_encoding
+// CHECK-LABEL: util.func public @lhs_encoding
 // CHECK-SAME:    %[[ARG0:[a-zA-Z0-9]+]]
-// CHECK:         return %[[ARG0]]
+// CHECK:         util.return %[[ARG0]]

diff --git a/compiler/src/iree/compiler/GlobalOptimization/test/optimize_numerics.mlir b/compiler/src/iree/compiler/GlobalOptimization/test/optimize_numerics.mlir
index 9d002cb..f31bb33 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/test/optimize_numerics.mlir
+++ b/compiler/src/iree/compiler/GlobalOptimization/test/optimize_numerics.mlir

@@ -1,7 +1,7 @@
 // RUN: iree-opt --iree-global-opt-optimize-numerics %s | FileCheck %s
 
 // CHECK-LABEL: @matmul_i8_i8_i32_unsigned
-func.func @matmul_i8_i8_i32_unsigned(%arg0 : tensor<5x3xf32>, %arg1 : tensor<3x1xf32>, %arg2 : tensor<5x1xf32>) -> tensor<5x1xf32> {
+util.func public @matmul_i8_i8_i32_unsigned(%arg0 : tensor<5x3xf32>, %arg1 : tensor<3x1xf32>, %arg2 : tensor<5x1xf32>) -> tensor<5x1xf32> {
   // CHECK: %[[LHS:.*]] = arith.fptoui %arg0 : tensor<5x3xf32> to tensor<5x3xi8>
   // CHECK: %[[RHS:.*]] = arith.fptoui %arg1 : tensor<3x1xf32> to tensor<3x1xi8>
   // CHECK: %[[INIT:.*]] = arith.fptoui %arg2 : tensor<5x1xf32> to tensor<5x1xi32>
@@ -11,11 +11,11 @@
   // CHECK: %[[RESULT:.*]] = linalg.matmul_unsigned ins(%[[LHS]], %[[RHS]] : tensor<5x3xi8>, tensor<3x1xi8>) outs(%[[INIT]] : tensor<5x1xi32>)
   %2 = linalg.matmul ins(%lhs, %rhs : tensor<5x3xf32>, tensor<3x1xf32>) outs(%init : tensor<5x1xf32>) -> tensor<5x1xf32>
   // CHECK: arith.uitofp %[[RESULT]] : tensor<5x1xi32> to tensor<5x1xf32>
-  return %2 : tensor<5x1xf32>
+  util.return %2 : tensor<5x1xf32>
 }
 
 // CHECK-LABEL: @matmul_i8_i8_i32_signed
-func.func @matmul_i8_i8_i32_signed(%arg0 : tensor<5x3xf32>, %arg1 : tensor<3x1xf32>, %arg2 : tensor<5x1xf32>) -> tensor<5x1xf32> {
+util.func public @matmul_i8_i8_i32_signed(%arg0 : tensor<5x3xf32>, %arg1 : tensor<3x1xf32>, %arg2 : tensor<5x1xf32>) -> tensor<5x1xf32> {
   // CHECK: %[[LHS:.*]] = arith.fptosi %arg0 : tensor<5x3xf32> to tensor<5x3xi8>
   // CHECK: %[[RHS:.*]] = arith.fptosi %arg1 : tensor<3x1xf32> to tensor<3x1xi8>
   // CHECK: %[[INIT:.*]] = arith.fptosi %arg2 : tensor<5x1xf32> to tensor<5x1xi32>
@@ -25,12 +25,12 @@
   // CHECK: %[[RESULT:.*]] = linalg.matmul ins(%[[LHS]], %[[RHS]] : tensor<5x3xi8>, tensor<3x1xi8>) outs(%[[INIT]] : tensor<5x1xi32>)
   %2 = linalg.matmul ins(%lhs, %rhs : tensor<5x3xf32>, tensor<3x1xf32>) outs(%init : tensor<5x1xf32>) -> tensor<5x1xf32>
   // CHECK: arith.sitofp %[[RESULT]] : tensor<5x1xi32> to tensor<5x1xf32>
-  return %2 : tensor<5x1xf32>
+  util.return %2 : tensor<5x1xf32>
 }
 
 // CHECK-LABEL: @matmul_i4_i4_i32_signed
 // For now we clamp this to i8
-func.func @matmul_i4_i4_i32_signed(%arg0 : tensor<5x3xf32>, %arg1 : tensor<3x1xf32>, %arg2 : tensor<5x1xf32>) -> tensor<5x1xf32> {
+util.func public @matmul_i4_i4_i32_signed(%arg0 : tensor<5x3xf32>, %arg1 : tensor<3x1xf32>, %arg2 : tensor<5x1xf32>) -> tensor<5x1xf32> {
   // CHECK: %[[LHS:.*]] = arith.fptosi %arg0 : tensor<5x3xf32> to tensor<5x3xi8>
   // CHECK: %[[RHS:.*]] = arith.fptosi %arg1 : tensor<3x1xf32> to tensor<3x1xi8>
   // CHECK: %[[INIT:.*]] = arith.fptosi %arg2 : tensor<5x1xf32> to tensor<5x1xi32>
@@ -40,38 +40,38 @@
   // CHECK: %[[RESULT:.*]] = linalg.matmul ins(%[[LHS]], %[[RHS]] : tensor<5x3xi8>, tensor<3x1xi8>) outs(%[[INIT]] : tensor<5x1xi32>)
   %2 = linalg.matmul ins(%lhs, %rhs : tensor<5x3xf32>, tensor<3x1xf32>) outs(%init : tensor<5x1xf32>) -> tensor<5x1xf32>
   // CHECK: arith.sitofp %[[RESULT]] : tensor<5x1xi32> to tensor<5x1xf32>
-  return %2 : tensor<5x1xf32>
+  util.return %2 : tensor<5x1xf32>
 }
 
 // CHECK-LABEL: @matmul_reject_gt_8bit
 // We may relax this restriction at some point but for right now we have it
 // because less analysis is needed to prove safety.
 // CHECK-NOT: fptosi
-func.func @matmul_reject_gt_8bit(%arg0 : tensor<5x3xf32>, %arg1 : tensor<3x1xf32>, %arg2 : tensor<5x1xf32>) -> tensor<5x1xf32> {
+util.func public @matmul_reject_gt_8bit(%arg0 : tensor<5x3xf32>, %arg1 : tensor<3x1xf32>, %arg2 : tensor<5x1xf32>) -> tensor<5x1xf32> {
   %lhs = util.numeric.optional_narrow %arg0 : tensor<5x3xf32> as ui9 {max_value = 312 : ui9, min_value = 0 : ui9}
   %rhs = util.numeric.optional_narrow %arg1 : tensor<3x1xf32> as si8 {max_value = 127 : si8, min_value = -127 : si8}
   %init = util.numeric.optional_narrow %arg2 : tensor<5x1xf32> as ui0
   // CHECK: linalg.matmul {{.*}} -> tensor<5x1xf32>
   %2 = linalg.matmul ins(%lhs, %rhs : tensor<5x3xf32>, tensor<3x1xf32>) outs(%init : tensor<5x1xf32>) -> tensor<5x1xf32>
-  return %2 : tensor<5x1xf32>
+  util.return %2 : tensor<5x1xf32>
 }
 
 // CHECK-LABEL: @cast_fill
-func.func @cast_fill(%arg0 : f32, %arg1 : tensor<3xf32>) -> tensor<3xi8> {
+util.func public @cast_fill(%arg0 : f32, %arg1 : tensor<3xf32>) -> tensor<3xi8> {
   // CHECK: %[[SCALAR:.*]] = arith.fptosi %arg0 : f32 to i8
   // CHECK: %[[INIT:.*]] = arith.fptosi %arg1 : tensor<3xf32> to tensor<3xi8>
   // CHECK: %[[RESULT:.*]] = linalg.fill ins(%[[SCALAR]] : i8) outs(%[[INIT]] : tensor<3xi8>) -> tensor<3xi8>
-  // CHECK: return %[[RESULT]]
+  // CHECK: util.return %[[RESULT]]
   %0 = linalg.fill ins(%arg0 : f32) outs(%arg1 : tensor<3xf32>) -> tensor<3xf32>
   %1 = arith.fptosi %0 : tensor<3xf32> to tensor<3xi8>
-  return %1 : tensor<3xi8>
+  util.return %1 : tensor<3xi8>
 }
 
 // CHECK-LABEL: @cast_init
-func.func @cast_init() -> tensor<5x9xi8> {
+util.func public @cast_init() -> tensor<5x9xi8> {
   // CHECK: %[[RESULT:.*]] = tensor.empty() : tensor<5x9xi8>
-  // CHECK: return %[[RESULT]]
+  // CHECK: util.return %[[RESULT]]
   %0 = tensor.empty() : tensor<5x9xf32>
   %1 = arith.fptosi %0 : tensor<5x9xf32> to tensor<5x9xi8>
-  return %1 : tensor<5x9xi8>
+  util.return %1 : tensor<5x9xi8>
 }

diff --git a/compiler/src/iree/compiler/GlobalOptimization/test/propagate_linalg_transpose.mlir b/compiler/src/iree/compiler/GlobalOptimization/test/propagate_linalg_transpose.mlir
index b1f0d8f..05f5d0b 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/test/propagate_linalg_transpose.mlir
+++ b/compiler/src/iree/compiler/GlobalOptimization/test/propagate_linalg_transpose.mlir

@@ -1,97 +1,97 @@
-// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-global-opt-propagate-linalg-transpose))" --split-input-file %s | FileCheck %s
-// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-global-opt-propagate-linalg-transpose{enable-aggressive-propagation=true}))" --split-input-file %s | FileCheck %s --check-prefix=APROP
-// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-global-opt-propagate-linalg-transpose{test-sinking-only=true}))" --split-input-file %s | FileCheck %s --check-prefix=SINK
+// RUN: iree-opt --pass-pipeline="builtin.module(util.func(iree-global-opt-propagate-linalg-transpose))" --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline="builtin.module(util.func(iree-global-opt-propagate-linalg-transpose{enable-aggressive-propagation=true}))" --split-input-file %s | FileCheck %s --check-prefix=APROP
+// RUN: iree-opt --pass-pipeline="builtin.module(util.func(iree-global-opt-propagate-linalg-transpose{test-sinking-only=true}))" --split-input-file %s | FileCheck %s --check-prefix=SINK
 
-func.func @specialize_transpose_op(%arg0 : tensor<1x2x3xf32>,
+util.func public @specialize_transpose_op(%arg0 : tensor<1x2x3xf32>,
                                    %empty : tensor<3x2x1xf32>) -> tensor<3x2x1xf32> {
   %transposed = linalg.generic {indexing_maps = [
                     affine_map<(d0, d1, d2) -> (d0, d2, d1)>,
                     affine_map<(d0, d1, d2) -> (d1, d2, d0)>],
-                iterator_types = ["parallel", "parallel", "parallel"]} 
+                iterator_types = ["parallel", "parallel", "parallel"]}
                 ins(%arg0 : tensor<1x2x3xf32>)
                 outs(%empty : tensor<3x2x1xf32>) {
                   ^bb0(%in: f32, %out: f32):
                     linalg.yield %in : f32
                   } -> tensor<3x2x1xf32>
-  return %transposed : tensor<3x2x1xf32>
+  util.return %transposed : tensor<3x2x1xf32>
 }
-// CHECK-LABEL: func @specialize_transpose_op
+// CHECK-LABEL: util.func public @specialize_transpose_op
 //       CHECK:   %[[TRANSPOSE:.+]] = linalg.transpose
 //  CHECK-SAME:     permutation = [2, 1, 0]
-//       CHECK:   return %[[TRANSPOSE]]
+//       CHECK:   util.return %[[TRANSPOSE]]
 
 // -----
 
-func.func @specialize_non_involution_transpose_op(%arg0 : tensor<1x2x3xf32>,
+util.func public @specialize_non_involution_transpose_op(%arg0 : tensor<1x2x3xf32>,
                                    %empty : tensor<2x3x1xf32>) -> tensor<2x3x1xf32> {
   %transposed = linalg.generic {indexing_maps = [
                     affine_map<(d0, d1, d2) -> (d2, d0, d1)>,
                     affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
-                iterator_types = ["parallel", "parallel", "parallel"]} 
+                iterator_types = ["parallel", "parallel", "parallel"]}
                 ins(%arg0 : tensor<1x2x3xf32>)
                 outs(%empty : tensor<2x3x1xf32>) {
                   ^bb0(%in: f32, %out: f32):
                     linalg.yield %in : f32
                   } -> tensor<2x3x1xf32>
-  return %transposed : tensor<2x3x1xf32>
+  util.return %transposed : tensor<2x3x1xf32>
 }
-// CHECK-LABEL: func @specialize_non_involution_transpose_op
+// CHECK-LABEL: util.func public @specialize_non_involution_transpose_op
 //       CHECK:   %[[TRANSPOSE:.+]] = linalg.transpose
 //  CHECK-SAME:     permutation = [1, 2, 0]
-//       CHECK:   return %[[TRANSPOSE]]
+//       CHECK:   util.return %[[TRANSPOSE]]
 
 // -----
 
-func.func @propagate_through_extract_slice(%arg0 : tensor<1x256x128xf32>) -> tensor<1x128x32xf32> {
+util.func public @propagate_through_extract_slice(%arg0 : tensor<1x256x128xf32>) -> tensor<1x128x32xf32> {
   %empty = tensor.empty(): tensor<1x128x256xf32>
   %transposed = linalg.transpose ins(%arg0 : tensor<1x256x128xf32>)
       outs(%empty : tensor<1x128x256xf32>) permutation = [0, 2, 1]
   %slice = tensor.extract_slice %transposed[0, 0, 0] [1, 128, 32] [1, 1, 1] : tensor<1x128x256xf32> to tensor<1x128x32xf32>
-  return %slice : tensor<1x128x32xf32>
+  util.return %slice : tensor<1x128x32xf32>
 }
-// CHECK-LABEL: func @propagate_through_extract_slice
+// CHECK-LABEL: util.func public @propagate_through_extract_slice
 //       CHECK:   %[[SLICE:.+]] = tensor.extract_slice {{.*}}[0, 0, 0] [1, 32, 128] [1, 1, 1]
 //  CHECK-SAME:                     tensor<1x256x128xf32> to tensor<1x32x128xf32>
 //       CHECK:   %[[TRANSPOSE:.+]] = linalg.transpose ins(%[[SLICE]] : tensor<1x32x128xf32>)
 //  CHECK-SAME:     permutation = [0, 2, 1]
-//       CHECK:   return %[[TRANSPOSE]]
+//       CHECK:   util.return %[[TRANSPOSE]]
 
 // -----
 
-func.func @propagate_through_rank_reduced_extract_slice(%arg0 : tensor<1x256x1x128x1xf32>) -> tensor<128x32xf32> {
+util.func public @propagate_through_rank_reduced_extract_slice(%arg0 : tensor<1x256x1x128x1xf32>) -> tensor<128x32xf32> {
   %empty = tensor.empty(): tensor<1x128x1x256x1xf32>
   %transposed = linalg.transpose ins(%arg0 : tensor<1x256x1x128x1xf32>)
       outs(%empty : tensor<1x128x1x256x1xf32>) permutation = [0, 3, 2, 1, 4]
   %slice = tensor.extract_slice %transposed[0, 0, 0, 0, 0] [1, 128, 1, 32, 1] [1, 1, 1, 1, 1]
              : tensor<1x128x1x256x1xf32> to tensor<128x32xf32>
-  return %slice : tensor<128x32xf32>
+  util.return %slice : tensor<128x32xf32>
 }
-// CHECK-LABEL: func @propagate_through_rank_reduced_extract_slice
+// CHECK-LABEL: util.func public @propagate_through_rank_reduced_extract_slice
 //       CHECK:   %[[SLICE:.+]] = tensor.extract_slice
 //  CHECK-SAME:                     [0, 0, 0, 0, 0] [1, 32, 1, 128, 1] [1, 1, 1, 1, 1]
 //  CHECK-SAME:                     tensor<1x256x1x128x1xf32> to tensor<32x128xf32>
 //       CHECK:   %[[TRANSPOSE:.+]] = linalg.transpose ins(%[[SLICE]] : tensor<32x128xf32>)
 //  CHECK-SAME:     permutation = [1, 0]
-//       CHECK:   return %[[TRANSPOSE]]
+//       CHECK:   util.return %[[TRANSPOSE]]
 
 // -----
 
-func.func @rank_reduced_extract_transposed_unit_dim(%arg0: tensor<256x1x32x128xf32>, %arg1: tensor<1x32x256x128xf32>) -> tensor<32x64x128xf32> {
-  %transposed = linalg.transpose ins(%arg0 : tensor<256x1x32x128xf32>) outs(%arg1 : tensor<1x32x256x128xf32>) permutation = [1, 2, 0, 3] 
+util.func public @rank_reduced_extract_transposed_unit_dim(%arg0: tensor<256x1x32x128xf32>, %arg1: tensor<1x32x256x128xf32>) -> tensor<32x64x128xf32> {
+  %transposed = linalg.transpose ins(%arg0 : tensor<256x1x32x128xf32>) outs(%arg1 : tensor<1x32x256x128xf32>) permutation = [1, 2, 0, 3]
   %extracted_slice = tensor.extract_slice %transposed[0, 0, 0, 0] [1, 32, 64, 128] [1, 1, 1, 1] : tensor<1x32x256x128xf32> to tensor<32x64x128xf32>
-  return %extracted_slice : tensor<32x64x128xf32>
+  util.return %extracted_slice : tensor<32x64x128xf32>
 }
-// SINK-LABEL: func @rank_reduced_extract_transposed_unit_dim
+// SINK-LABEL: util.func public @rank_reduced_extract_transposed_unit_dim
 //       SINK:   %[[EXT:.+]] = tensor.extract_slice
 //  SINK-SAME:                   tensor<256x1x32x128xf32> to tensor<64x32x128xf32>
 //       SINK:   %[[RES:.+]] = linalg.transpose ins(%[[EXT]] : tensor<64x32x128xf32>
 //  SINK-SAME:                    outs({{.*}} : tensor<32x64x128xf32>)
 //  SINK-SAME:                    permutation = [1, 0, 2]
-//       SINK:   return %[[RES]] : tensor<32x64x128xf32>
+//       SINK:   util.return %[[RES]] : tensor<32x64x128xf32>
 
 // -----
 
-func.func @propagate_to_matmul_ops(%lhs: tensor<16x16xf32>,
+util.func public @propagate_to_matmul_ops(%lhs: tensor<16x16xf32>,
                                    %transposed_a: tensor<16x16xf32>,
                                    %transposed_b: tensor<16x16xf32>) -> tensor<16x16xf32> {
   %empty = tensor.empty(): tensor<16x16xf32>
@@ -104,16 +104,16 @@
       outs(%empty : tensor<16x16xf32>) permutation = [1, 0]
   %second_mm = linalg.matmul ins(%second_lhs, %first_mm : tensor<16x16xf32>, tensor<16x16xf32>)
                             outs(%empty : tensor<16x16xf32>) -> tensor<16x16xf32>
-  return %second_mm : tensor<16x16xf32>
+  util.return %second_mm : tensor<16x16xf32>
 }
-// CHECK-LABEL: func @propagate_to_matmul_ops
+// CHECK-LABEL: util.func public @propagate_to_matmul_ops
 //       CHECK:   linalg.matmul_transpose_b
 //       CHECK:   %[[SECOND_MM:.+]] = linalg.matmul_transpose_a
-//       CHECK:   return %[[SECOND_MM]]
+//       CHECK:   util.return %[[SECOND_MM]]
 
 // -----
 
-func.func @propagate_to_transposed_matmul_ops(%lhs: tensor<16x16xf32>,
+util.func public @propagate_to_transposed_matmul_ops(%lhs: tensor<16x16xf32>,
                                               %second_lhs: tensor<16x16xf32>,
                                               %rhs: tensor<16x16xf32>) -> tensor<16x16xf32> {
   %empty = tensor.empty(): tensor<16x16xf32>
@@ -126,16 +126,16 @@
       outs(%empty : tensor<16x16xf32>) permutation = [1, 0]
   %second_mm = linalg.matmul_transpose_a ins(%transpose_a, %first_mm : tensor<16x16xf32>, tensor<16x16xf32>)
                                          outs(%empty : tensor<16x16xf32>) -> tensor<16x16xf32>
-  return %second_mm : tensor<16x16xf32>
+  util.return %second_mm : tensor<16x16xf32>
 }
-// CHECK-LABEL: func @propagate_to_transposed_matmul_ops
+// CHECK-LABEL: util.func public @propagate_to_transposed_matmul_ops
 //       CHECK:   linalg.matmul ins
 //       CHECK:   %[[SECOND_MM:.+]] = linalg.matmul ins
-//       CHECK:   return %[[SECOND_MM]]
+//       CHECK:   util.return %[[SECOND_MM]]
 
 // -----
 
-func.func @propagate_to_bmm_ops(%lhs: tensor<2x16x16xf32>,
+util.func public @propagate_to_bmm_ops(%lhs: tensor<2x16x16xf32>,
                                    %transposed_a: tensor<2x16x16xf32>,
                                    %transposed_b: tensor<2x16x16xf32>) -> tensor<2x16x16xf32> {
   %empty = tensor.empty(): tensor<2x16x16xf32>
@@ -148,16 +148,16 @@
       outs(%empty : tensor<2x16x16xf32>) permutation = [0, 2, 1]
   %second_bmm = linalg.batch_matmul ins(%second_lhs, %first_bmm : tensor<2x16x16xf32>, tensor<2x16x16xf32>)
                             outs(%empty : tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
-  return %second_bmm : tensor<2x16x16xf32>
+  util.return %second_bmm : tensor<2x16x16xf32>
 }
-// CHECK-LABEL: func @propagate_to_bmm_ops
+// CHECK-LABEL: util.func public @propagate_to_bmm_ops
 //       CHECK:   linalg.batch_matmul_transpose_b
 //       CHECK:   %[[SECOND_MM:.+]] = linalg.batch_matmul_transpose_a
-//       CHECK:   return %[[SECOND_MM]]
+//       CHECK:   util.return %[[SECOND_MM]]
 
 // -----
 
-func.func @propagate_to_transposed_bmm_ops(%lhs: tensor<2x16x16xf32>,
+util.func public @propagate_to_transposed_bmm_ops(%lhs: tensor<2x16x16xf32>,
                                               %second_lhs: tensor<2x16x16xf32>,
                                               %rhs: tensor<2x16x16xf32>) -> tensor<2x16x16xf32> {
   %empty = tensor.empty(): tensor<2x16x16xf32>
@@ -170,16 +170,16 @@
       outs(%empty : tensor<2x16x16xf32>) permutation = [0, 2, 1]
   %second_bmm = linalg.batch_matmul_transpose_a ins(%transpose_a, %first_bmm : tensor<2x16x16xf32>, tensor<2x16x16xf32>)
                                          outs(%empty : tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
-  return %second_bmm : tensor<2x16x16xf32>
+  util.return %second_bmm : tensor<2x16x16xf32>
 }
-// CHECK-LABEL: func @propagate_to_transposed_bmm_ops
+// CHECK-LABEL: util.func public @propagate_to_transposed_bmm_ops
 //       CHECK:   linalg.batch_matmul ins
 //       CHECK:   %[[SECOND_MM:.+]] = linalg.batch_matmul ins
-//       CHECK:   return %[[SECOND_MM]]
+//       CHECK:   util.return %[[SECOND_MM]]
 
 // -----
 
-func.func @do_not_propagate_to_matmul_in_dispatch(%lhs: tensor<16x16xf32>,
+util.func public @do_not_propagate_to_matmul_in_dispatch(%lhs: tensor<16x16xf32>,
                                                   %transposed_b: tensor<16x16xf32>) -> tensor<16x16xf32> {
   %empty = tensor.empty(): tensor<16x16xf32>
   %rhs = linalg.transpose ins(%transposed_b : tensor<16x16xf32>)
@@ -189,45 +189,45 @@
                               outs(%empty : tensor<16x16xf32>) -> tensor<16x16xf32>
     flow.return %mm : tensor<16x16xf32>
   }
-  return %dispatch : tensor<16x16xf32>
+  util.return %dispatch : tensor<16x16xf32>
 }
-// CHECK-LABEL: func @do_not_propagate_to_matmul_in_dispatch
+// CHECK-LABEL: util.func public @do_not_propagate_to_matmul_in_dispatch
 //       CHECK:   linalg.transpose
 //       CHECK:   %[[DISPATCH:.+]] = flow.dispatch.region
 //       CHECK:     linalg.matmul ins
-//       CHECK:   return %[[DISPATCH]]
+//       CHECK:   util.return %[[DISPATCH]]
 
 // -----
 
-func.func @propagate_to_bmm_transpose_batch(%transposed_lhs: tensor<16x2x16xf32>,
+util.func public @propagate_to_bmm_transpose_batch(%transposed_lhs: tensor<16x2x16xf32>,
                                             %rhs: tensor<2x16x16xf32>) -> tensor<2x16x16xf32> {
   %empty = tensor.empty(): tensor<2x16x16xf32>
   %lhs = linalg.transpose ins(%transposed_lhs : tensor<16x2x16xf32>)
       outs(%empty : tensor<2x16x16xf32>) permutation = [1, 0, 2]
   %bmm = linalg.batch_matmul ins(%lhs, %rhs : tensor<2x16x16xf32>, tensor<2x16x16xf32>)
                             outs(%empty : tensor<2x16x16xf32>) -> tensor<2x16x16xf32>
-  return %bmm : tensor<2x16x16xf32>
+  util.return %bmm : tensor<2x16x16xf32>
 }
 // Verify that without aggressive propagation, this stays as a batch matmul
-// CHECK-LABEL: func @propagate_to_bmm_transpose_batch
+// CHECK-LABEL: util.func public @propagate_to_bmm_transpose_batch
 //       CHECK:   linalg.batch_matmul
 
 // APROP: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d1, d0, d3)>
 // APROP: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
 // APROP: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
 
-// APROP-LABEL: func @propagate_to_bmm_transpose_batch
+// APROP-LABEL: util.func public @propagate_to_bmm_transpose_batch
 //  APROP-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<16x2x16xf32>
 //  APROP-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: tensor<2x16x16xf32>
 //       APROP:   %[[GENERIC:.+]] = linalg.generic
 //  APROP-SAME:     indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
 //  APROP-SAME:     iterator_types = ["parallel", "parallel", "parallel", "reduction"]
 //  APROP-SAME:     ins(%[[ARG0]], %[[ARG1]] : tensor<16x2x16xf32>, tensor<2x16x16xf32>
-//       APROP:   return %[[GENERIC]] : tensor<2x16x16xf32>
+//       APROP:   util.return %[[GENERIC]] : tensor<2x16x16xf32>
 
 // -----
 
-func.func @sink_through_expand_shape(%arg0 : tensor<?x?x?xf32>) -> tensor<32x?x16x?x?xf32> {
+util.func public @sink_through_expand_shape(%arg0 : tensor<?x?x?xf32>) -> tensor<32x?x16x?x?xf32> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
@@ -238,29 +238,29 @@
   %transposed = linalg.transpose ins(%arg0 : tensor<?x?x?xf32>)
       outs(%empty : tensor<?x?x?xf32>) permutation = [1, 0, 2]
   %expanded = tensor.expand_shape %transposed [[0, 1], [2, 3], [4]] : tensor<?x?x?xf32> into tensor<32x?x16x?x?xf32>
-  return %expanded : tensor<32x?x16x?x?xf32>
+  util.return %expanded : tensor<32x?x16x?x?xf32>
 }
-// SINK-LABEL: func @sink_through_expand_shape
+// SINK-LABEL: util.func public @sink_through_expand_shape
 //       SINK:   %[[EXP:.+]] = tensor.expand_shape {{.*}} {{\[\[}}0, 1], [2, 3], [4]]
 //  SINK-SAME:                   tensor<?x?x?xf32> into tensor<16x?x32x?x?xf32>
 //       SINK:   %[[RES:.+]] = linalg.transpose ins(%[[EXP]] : tensor<16x?x32x?x?xf32>
 //  SINK-SAME:                    outs({{.*}} : tensor<32x?x16x?x?xf32>)
 //  SINK-SAME:                    permutation = [2, 3, 0, 1, 4]
-//       SINK:   return %[[RES]] : tensor<32x?x16x?x?xf32>
+//       SINK:   util.return %[[RES]] : tensor<32x?x16x?x?xf32>
 
 // -----
 
-func.func @sink_non_involution_through_expand_shape(%arg0 : tensor<2x3x4xf32>) -> tensor<1x3x4x2xf32> {
+util.func public @sink_non_involution_through_expand_shape(%arg0 : tensor<2x3x4xf32>) -> tensor<1x3x4x2xf32> {
   %empty = tensor.empty(): tensor<3x4x2xf32>
   %transposed = linalg.transpose ins(%arg0 : tensor<2x3x4xf32>)
       outs(%empty : tensor<3x4x2xf32>) permutation = [1, 2, 0]
   %expanded = tensor.expand_shape %transposed [[0, 1], [2], [3]] : tensor<3x4x2xf32> into tensor<1x3x4x2xf32>
-  return %expanded : tensor<1x3x4x2xf32>
+  util.return %expanded : tensor<1x3x4x2xf32>
 }
-// SINK-LABEL: func @sink_non_involution_through_expand_shape
+// SINK-LABEL: util.func public @sink_non_involution_through_expand_shape
 //       SINK:   %[[EXP:.+]] = tensor.expand_shape {{.*}} {{\[\[}}0], [1, 2], [3]]
 //  SINK-SAME:                   tensor<2x3x4xf32> into tensor<2x1x3x4xf32>
 //       SINK:   %[[RES:.+]] = linalg.transpose ins(%[[EXP]] : tensor<2x1x3x4xf32>
 //  SINK-SAME:                    outs({{.*}} : tensor<1x3x4x2xf32>)
 //  SINK-SAME:                    permutation = [1, 2, 3, 0]
-//       SINK:   return %[[RES]] : tensor<1x3x4x2xf32>
+//       SINK:   util.return %[[RES]] : tensor<1x3x4x2xf32>

diff --git a/compiler/src/iree/compiler/GlobalOptimization/test/raise_special_ops.mlir b/compiler/src/iree/compiler/GlobalOptimization/test/raise_special_ops.mlir
index 48f9ddc..132a08b 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/test/raise_special_ops.mlir
+++ b/compiler/src/iree/compiler/GlobalOptimization/test/raise_special_ops.mlir

@@ -4,9 +4,9 @@
 //  CHECK-SAME: %[[ARG:.+]]: tensor<?x?x?xf32>
 //       CHECK:   %[[E:.+]] = tensor.empty(%{{.*}}, %{{.*}}, %{{.*}}) : tensor<?x?x?xf32>
 //       CHECK:   %[[S:.+]] = linalg.softmax dimension(2) ins(%[[ARG]] : tensor<?x?x?xf32>) outs(%[[E]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
-//       CHECK:   return %[[S]] : tensor<?x?x?xf32>
+//       CHECK:   util.return %[[S]] : tensor<?x?x?xf32>
 
-func.func @softmax(%src : tensor<?x?x?xf32>) -> (tensor<?x?x?xf32>) {
+util.func public @softmax(%src : tensor<?x?x?xf32>) -> (tensor<?x?x?xf32>) {
   %cst = arith.constant 1.000000e+00 : f32
   %cst_0 = arith.constant 0.000000e+00 : f32
   %cst_1 = arith.constant -3.40282347E+38 : f32
@@ -50,15 +50,17 @@
     %11 = arith.mulf %arg0, %arg1 : f32
     linalg.yield %11 : f32
   } -> tensor<?x?x?xf32>
-  return %10 : tensor<?x?x?xf32>
+  util.return %10 : tensor<?x?x?xf32>
 }
 
+// -----
+
 // CHECK-LABEL: @softmax_no_rcp
 //  CHECK-SAME: %[[ARG:.+]]: tensor<10x4096x4096xf16>
 //       CHECK:   %[[E:.+]] = tensor.empty() : tensor<10x4096x4096xf16>
 //       CHECK:   %[[S:.+]] = linalg.softmax dimension(2) ins(%[[ARG]] : tensor<10x4096x4096xf16>) outs(%[[E]] : tensor<10x4096x4096xf16>) -> tensor<10x4096x4096xf16>
-//       CHECK:   return %[[S]] : tensor<10x4096x4096xf16>
-func.func @softmax_no_rcp(%src : tensor<10x4096x4096xf16>) -> (tensor<10x4096x4096xf16>) {
+//       CHECK:   util.return %[[S]] : tensor<10x4096x4096xf16>
+util.func public @softmax_no_rcp(%src : tensor<10x4096x4096xf16>) -> (tensor<10x4096x4096xf16>) {
   %cst_158 = arith.constant -6.550400e+04 : f16
   %cst_121 = arith.constant 0.000000e+00 : f16
   %224 = tensor.empty() : tensor<10x4096xf16>
@@ -106,16 +108,17 @@
     %5290 = arith.divf %in, %in_1572 : f16
     linalg.yield %5290 : f16
   } -> tensor<10x4096x4096xf16>
-  return %232 : tensor<10x4096x4096xf16>
+  util.return %232 : tensor<10x4096x4096xf16>
 }
 
+// -----
 
 // CHECK-LABEL: @softmax_broadcast
 //  CHECK-SAME: %[[ARG:.+]]: tensor<12x128x128xf32>
 //       CHECK:   %[[E:.+]] = tensor.empty() : tensor<12x128x128xf32>
 //       CHECK:   %[[S:.+]] = linalg.softmax dimension(2) ins(%[[ARG]] : tensor<12x128x128xf32>) outs(%[[E]] : tensor<12x128x128xf32>) -> tensor<12x128x128xf32>
-//       CHECK:   return %[[S]] : tensor<12x128x128xf32>
-func.func @softmax_broadcast(%93 : tensor<12x128x128xf32>) -> (tensor<12x128x128xf32>) {
+//       CHECK:   util.return %[[S]] : tensor<12x128x128xf32>
+util.func public @softmax_broadcast(%93 : tensor<12x128x128xf32>) -> (tensor<12x128x128xf32>) {
   %cst_16 = arith.constant 0xFF800000 : f32
   %cst_18 = arith.constant -0.000000e+00 : f32
   %94 = tensor.empty() : tensor<12x128xf32>
@@ -160,10 +163,12 @@
     %2460 = arith.divf %in, %in_261 : f32
     linalg.yield %2460 : f32
   } -> tensor<12x128x128xf32>
-  return %109 : tensor<12x128x128xf32>
+  util.return %109 : tensor<12x128x128xf32>
 }
 
-func.func @aTransposeBMatmul(%arg0 : tensor<10x20xf32>,
+// -----
+
+util.func public @aTransposeBMatmul(%arg0 : tensor<10x20xf32>,
     %arg1 : tensor<40x20xf32>) -> tensor<10x40xf32> {
   %0 = tensor.empty() : tensor<20x40xf32>
   %1 = linalg.generic {
@@ -178,16 +183,18 @@
   %4 = linalg.fill ins(%3 : f32) outs(%2 : tensor<10x40xf32>) -> tensor<10x40xf32>
   %5 = linalg.matmul ins(%arg0, %1 : tensor<10x20xf32>, tensor<20x40xf32>)
       outs(%4 : tensor<10x40xf32>) -> tensor<10x40xf32>
-  return %5 : tensor<10x40xf32>
+  util.return %5 : tensor<10x40xf32>
 }
-// CHECK-LABEL: func @aTransposeBMatmul
+// CHECK-LABEL: util.func public @aTransposeBMatmul
 //  CHECK-SAME:     %[[ARG0:.+]]: tensor<10x20xf32>
 //  CHECK-SAME:     %[[ARG1:.+]]: tensor<40x20xf32>
 //       CHECK:   %[[RESULT:.+]] = linalg.matmul_transpose_b
 //  CHECK-SAME:       ins(%[[ARG0]], %[[ARG1]] :
-//       CHECK:   return %[[RESULT]]
+//       CHECK:   util.return %[[RESULT]]
 
-func.func @aTransposeBBatchMatmul(%arg0 : tensor<5x10x20xf32>,
+// -----
+
+util.func public @aTransposeBBatchMatmul(%arg0 : tensor<5x10x20xf32>,
     %arg1 : tensor<5x40x20xf32>) -> tensor<5x10x40xf32> {
   %0 = tensor.empty() : tensor<5x20x40xf32>
   %1 = linalg.generic {
@@ -202,16 +209,18 @@
   %4 = linalg.fill ins(%3 : f32) outs(%2 : tensor<5x10x40xf32>) -> tensor<5x10x40xf32>
   %5 = linalg.batch_matmul ins(%arg0, %1 : tensor<5x10x20xf32>, tensor<5x20x40xf32>)
       outs(%4 : tensor<5x10x40xf32>) -> tensor<5x10x40xf32>
-  return %5 : tensor<5x10x40xf32>
+  util.return %5 : tensor<5x10x40xf32>
 }
-// CHECK-LABEL: func @aTransposeBBatchMatmul
+// CHECK-LABEL: util.func public @aTransposeBBatchMatmul
 //  CHECK-SAME:     %[[ARG0:.+]]: tensor<5x10x20xf32>
 //  CHECK-SAME:     %[[ARG1:.+]]: tensor<5x40x20xf32>
 //       CHECK:   %[[RESULT:.+]] = linalg.batch_matmul_transpose_b
 //  CHECK-SAME:       ins(%[[ARG0]], %[[ARG1]] :
-//       CHECK:   return %[[RESULT]]
+//       CHECK:   util.return %[[RESULT]]
 
-func.func @generic_fill(%arg0: tensor<?x?xf32>) -> tensor<1x1x?x?xf32> {
+// -----
+
+util.func public @generic_fill(%arg0: tensor<?x?xf32>) -> tensor<1x1x?x?xf32> {
     %cst = arith.constant 0.000000e+00 : f32
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
@@ -225,9 +234,9 @@
       ^bb0(%out: f32):
         linalg.yield %cst : f32
     } -> tensor<1x1x?x?xf32>
-    return %1 : tensor<1x1x?x?xf32>
+    util.return %1 : tensor<1x1x?x?xf32>
 }
-// CHECK-LABEL: func @generic_fill
+// CHECK-LABEL: util.func public @generic_fill
 //  CHECK-SAME:     %[[ARG0:.+]]: tensor<?x?xf32>
 //       CHECK:   %[[CST:.+]] = arith.constant 0.000000e+00 : f32
 //       CHECK:   %[[EMPTY:.+]] = tensor.empty
@@ -235,12 +244,12 @@
 //       CHECK:   %[[RESULT:.+]] = linalg.fill
 //  CHECK-SAME:       ins(%[[CST]] : f32)
 //  CHECK-SAME:       outs(%[[EMPTY]] : tensor<1x1x?x?xf32>)
-//       CHECK:   return %[[RESULT]]
+//       CHECK:   util.return %[[RESULT]]
 
 // -----
 
 #map = affine_map<(d0) -> (d0)>
-func.func @test_rank_reduce(%A : tensor<1x1x5120xf32>, %B : tensor<5120xf32>) -> tensor<5120xf32> {
+util.func public @test_rank_reduce(%A : tensor<1x1x5120xf32>, %B : tensor<5120xf32>) -> tensor<5120xf32> {
   %c0 = arith.constant 0 : index
   %0 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel"]} outs(%B : tensor<5120xf32>) {
   ^bb0(%out: f32):
@@ -248,17 +257,17 @@
     %extracted = tensor.extract %A[%c0, %c0, %12] : tensor<1x1x5120xf32>
     linalg.yield %extracted : f32
   } -> tensor<5120xf32>
-  return %0 : tensor<5120xf32>
+  util.return %0 : tensor<5120xf32>
 }
 
-// CHECK-LABEL: func @test_rank_reduce
+// CHECK-LABEL: util.func public @test_rank_reduce
 //       CHECK:   tensor.extract_slice %{{.*}}[0, 0, 0] [1, 1, 5120] [1, 1, 1]
 //  CHECK-SAME:     tensor<1x1x5120xf32> to tensor<5120xf32>
 
 // -----
 
 #map = affine_map<(d0, d1) -> (d0, d1)>
-func.func @test_slice_middle(%A : tensor<64x64x64xf32>, %B : tensor<64x64xf32>) -> tensor<64x64xf32> {
+util.func public @test_slice_middle(%A : tensor<64x64x64xf32>, %B : tensor<64x64xf32>) -> tensor<64x64xf32> {
   %c0 = arith.constant 0 : index
   %0 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel", "parallel"]} outs(%B : tensor<64x64xf32>) {
   ^bb0(%out: f32):
@@ -267,16 +276,16 @@
     %extracted = tensor.extract %A[%i1, %c0, %i2] : tensor<64x64x64xf32>
     linalg.yield %extracted : f32
   } -> tensor<64x64xf32>
-  return %0 : tensor<64x64xf32>
+  util.return %0 : tensor<64x64xf32>
 }
 
-// CHECK-LABEL: func @test_slice_middle
+// CHECK-LABEL: util.func public @test_slice_middle
 //       CHECK:   tensor.extract_slice %{{.*}}[0, 0, 0] [64, 1, 64] [1, 1, 1]
 //  CHECK-SAME:     tensor<64x64x64xf32> to tensor<64x64xf32>
 
 // -----
 
-func.func @test_trailing_elementwise(%arg0: tensor<180x320x1xf32>) -> tensor<320xf32> {
+util.func public @test_trailing_elementwise(%arg0: tensor<180x320x1xf32>) -> tensor<320xf32> {
   %c0 = arith.constant 0 : index
   %c179 = arith.constant 179 : index
   %70 = tensor.empty() : tensor<320xf32>
@@ -286,10 +295,10 @@
     %extracted = tensor.extract %arg0[%c0, %76, %c0] : tensor<180x320x1xf32>
     linalg.yield %extracted : f32
   } -> tensor<320xf32>
-  return %71 : tensor<320xf32>
+  util.return %71 : tensor<320xf32>
 }
 
-// CHECK-LABEL: func @test_trailing_elementwise
+// CHECK-LABEL: util.func public @test_trailing_elementwise
 //       CHECK:   tensor.extract_slice %{{.*}}[0, 0, 0] [1, 320, 1] [1, 1, 1]
 //  CHECK-SAME:     tensor<180x320x1xf32> to tensor<320xf32>
 
@@ -298,8 +307,8 @@
 // This currently should not be raised as the operation does not remain
 // elementwise after raising the tensor.extract to input.
 #map = affine_map<(d0, d1) -> (d0, d1)>
-// CHECK-LABEL: func @test_non_slice
-func.func @test_non_slice(%A : tensor<128x128x128xf32>, %B : tensor<64x64xf32>) -> tensor<64x64xf32> {
+// CHECK-LABEL: util.func public @test_non_slice
+util.func public @test_non_slice(%A : tensor<128x128x128xf32>, %B : tensor<64x64xf32>) -> tensor<64x64xf32> {
   %c0 = arith.constant 0 : index
   // CHECK: linalg.generic
   %0 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel", "parallel"]} outs(%B : tensor<64x64xf32>) {
@@ -309,13 +318,13 @@
     %extracted = tensor.extract %A[%i1, %c0, %i2] : tensor<128x128x128xf32>
     linalg.yield %extracted : f32
   } -> tensor<64x64xf32>
-  return %0 : tensor<64x64xf32>
+  util.return %0 : tensor<64x64xf32>
 }
 
 // -----
 
 #map = affine_map<(d0, d1) -> (d0, d1)>
-func.func @test_slice_negate_cat_peephole(%arg0: tensor<1x32x1x128xf16>) -> tensor<1x32x1x128xf16> {
+util.func public @test_slice_negate_cat_peephole(%arg0: tensor<1x32x1x128xf16>) -> tensor<1x32x1x128xf16> {
   %1 = tensor.empty() : tensor<1x32x1x128xf16>
   %2 = tensor.empty() : tensor<32x64xf16>
   %extracted_slice = tensor.extract_slice %arg0[0, 0, 0, 0] [1, 32, 1, 64] [1, 1, 1, 1] : tensor<1x32x1x128xf16> to tensor<32x64xf16>
@@ -327,10 +336,10 @@
   } -> tensor<32x64xf16>
   %inserted_slice = tensor.insert_slice %3 into %1[0, 0, 0, 0] [1, 32, 1, 64] [1, 1, 1, 1] : tensor<32x64xf16> into tensor<1x32x1x128xf16>
   %inserted_slice_1 = tensor.insert_slice %extracted_slice into %inserted_slice[0, 0, 0, 64] [1, 32, 1, 64] [1, 1, 1, 1] : tensor<32x64xf16> into tensor<1x32x1x128xf16>
-  return %inserted_slice_1 : tensor<1x32x1x128xf16>
+  util.return %inserted_slice_1 : tensor<1x32x1x128xf16>
 }
 
-// CHECK-LABEL: func.func @test_slice_negate_cat_peephole
+// CHECK-LABEL: util.func public @test_slice_negate_cat_peephole
 //  CHECK-SAME:     %[[ARG0:.+]]: tensor<1x32x1x128xf16>
 //       CHECK:   %[[C1:.+]] = arith.constant 1 : index
 //       CHECK:   %[[EXPIN:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0], [1], [2], [3, 4]] : tensor<1x32x1x128xf16> into tensor<1x32x1x2x64xf16>
@@ -350,12 +359,12 @@
 //       CHECK:      linalg.yield %[[SEL]] : f16
 
 //       CHECK:   %[[COLLAPSE:.+]] = tensor.collapse_shape %[[NREV]] {{\[\[}}0], [1], [2], [3, 4]] : tensor<1x32x1x2x64xf16> into tensor<1x32x1x128xf16>
-//       CHECK:   return %[[COLLAPSE]]
+//       CHECK:   util.return %[[COLLAPSE]]
 
 // -----
 
 #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
-func.func @test_slice_negate_cat_peephole_dynamic(%arg0: tensor<1x32x?x128xf16>) -> tensor<1x32x?x128xf16> {
+util.func public @test_slice_negate_cat_peephole_dynamic(%arg0: tensor<1x32x?x128xf16>) -> tensor<1x32x?x128xf16> {
   %c2 = arith.constant 2 : index
   %d2 = tensor.dim %arg0, %c2 : tensor<1x32x?x128xf16>
   %1 = tensor.empty(%d2) : tensor<1x32x?x128xf16>
@@ -369,21 +378,21 @@
   } -> tensor<32x?x64xf16>
   %inserted_slice = tensor.insert_slice %3 into %1[0, 0, 0, 0] [1, 32, %d2, 64] [1, 1, 1, 1] : tensor<32x?x64xf16> into tensor<1x32x?x128xf16>
   %inserted_slice_1 = tensor.insert_slice %extracted_slice into %inserted_slice[0, 0, 0, 64] [1, 32, %d2, 64] [1, 1, 1, 1] : tensor<32x?x64xf16> into tensor<1x32x?x128xf16>
-  return %inserted_slice_1 : tensor<1x32x?x128xf16>
+  util.return %inserted_slice_1 : tensor<1x32x?x128xf16>
 }
 
 /// Verify that the pattern kicks in for a simple dynamic example.
-// CHECK-LABEL: func.func @test_slice_negate_cat_peephole_dynamic
+// CHECK-LABEL: util.func public @test_slice_negate_cat_peephole_dynamic
 //       CHECK:    tensor.expand_shape
 //       CHECK:    linalg.generic
 //       CHECK:      tensor.extract
 //       CHECK:    %[[COL:.+]] = tensor.collapse_shape
-//       CHECK:    return %[[COL]]
+//       CHECK:    util.return %[[COL]]
 
 // -----
 
 #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
-func.func @test_slice_negate_cat_peephole_dynamic(%arg0: tensor<32x?x128xf16>) -> tensor<32x?x128xf16> {
+util.func public @test_slice_negate_cat_peephole_dynamic(%arg0: tensor<32x?x128xf16>) -> tensor<32x?x128xf16> {
   %c2 = arith.constant 2 : index
   %d2 = tensor.dim %arg0, %c2 : tensor<32x?x128xf16>
   %1 = tensor.empty(%d2) : tensor<32x?x128xf16>
@@ -396,13 +405,13 @@
     linalg.yield %5 : f16
   } -> tensor<32x?x64xf16>
   %concat = tensor.concat dim(2) %3, %extracted_slice : (tensor<32x?x64xf16>, tensor<32x?x64xf16>) -> tensor<32x?x128xf16>
-  return %concat : tensor<32x?x128xf16>
+  util.return %concat : tensor<32x?x128xf16>
 }
 
 /// Verify that the pattern kicks in for tensor.concat as well.
-// CHECK-LABEL: func.func @test_slice_negate_cat_peephole_dynamic
+// CHECK-LABEL: util.func public @test_slice_negate_cat_peephole_dynamic
 //       CHECK:    tensor.expand_shape
 //       CHECK:    linalg.generic
 //       CHECK:      tensor.extract
 //       CHECK:    %[[COL:.+]] = tensor.collapse_shape
-//       CHECK:    return %[[COL]]
+//       CHECK:    util.return %[[COL]]

diff --git a/compiler/src/iree/compiler/GlobalOptimization/test/remove_zero_extent_tensors.mlir b/compiler/src/iree/compiler/GlobalOptimization/test/remove_zero_extent_tensors.mlir
index 289f372..ca768a2 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/test/remove_zero_extent_tensors.mlir
+++ b/compiler/src/iree/compiler/GlobalOptimization/test/remove_zero_extent_tensors.mlir

@@ -1,38 +1,38 @@
 // RUN: iree-opt --split-input-file --allow-unregistered-dialect \
-// RUN:          --pass-pipeline="builtin.module(func.func(iree-global-opt-remove-zero-extent-tensors))" \
+// RUN:          --pass-pipeline="builtin.module(util.func(iree-global-opt-remove-zero-extent-tensors))" \
 // RUN:          %s | FileCheck %s
 
-func.func @zero_sized_operands(%arg0 : tensor<?x0xf32>, %arg1 : index) -> tensor<?x?xf32> {
+util.func public @zero_sized_operands(%arg0 : tensor<?x0xf32>, %arg1 : index) -> tensor<?x?xf32> {
   %0 = tensor.empty(%arg1): tensor<0x?xf32>
   %1 = "some_op"(%arg0, %0) : (tensor<?x0xf32>, tensor<0x?xf32>) -> tensor<?x?xf32>
-  return %1 : tensor<?x?xf32>
+  util.return %1 : tensor<?x?xf32>
 }
-//      CHECK: func @zero_sized_operands
+//      CHECK: util.func public @zero_sized_operands
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<?x0xf32>
 // CHECK-SAME:     %[[ARG1:.+]]: index
 //      CHECK:   %[[EMPTY0:.+]] = tensor.empty(%[[ARG1]])
 //      CHECK:   %[[DIM:.+]] = tensor.dim %[[ARG0]]
 //      CHECK:   %[[EMPTY1:.+]] = tensor.empty(%[[DIM]])
 //      CHECK:   %[[RESULT:.+]] = "some_op"(%[[EMPTY1]], %[[EMPTY0]]
-//      CHECK:   return %[[RESULT]]
+//      CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @zero_sized_tensor_insert(%arg0 : tensor<?x?xf32>, %arg1 : tensor<0x?xf32>,
+util.func public @zero_sized_tensor_insert(%arg0 : tensor<?x?xf32>, %arg1 : tensor<0x?xf32>,
     %arg2 : index) -> tensor<?x?xf32> {
   %1 = tensor.insert_slice %arg1 into %arg0[0, 0] [0, %arg2] [1, 1] : tensor<0x?xf32> into tensor<?x?xf32>
-  return %1 : tensor<?x?xf32>
+  util.return %1 : tensor<?x?xf32>
 }
-// CHECK: func @zero_sized_tensor_insert(%[[ARG0:.+]]: tensor<?x?xf32>
-// CHECK:   return %[[ARG0]]
+// CHECK: util.func public @zero_sized_tensor_insert(%[[ARG0:.+]]: tensor<?x?xf32>
+// CHECK:   util.return %[[ARG0]]
 
 // -----
 
-func.func @zero_sizes_tensor_insert_dest(%arg0 : tensor<0x?xf32>, %arg1 : index) -> tensor<0x?xf32> {
+util.func public @zero_sizes_tensor_insert_dest(%arg0 : tensor<0x?xf32>, %arg1 : index) -> tensor<0x?xf32> {
   %0 = tensor.empty(%arg1) : tensor<0x?xf32>
   %1 = tensor.insert_slice %arg0 into %0[0, 0] [0, %arg1] [1, 1] : tensor<0x?xf32> into tensor<0x?xf32>
-  return %1 : tensor<0x?xf32>
+  util.return %1 : tensor<0x?xf32>
 }
-// CHECK: func @zero_sizes_tensor_insert_dest(%[[ARG0:.+]]: tensor<0x?xf32>, %[[ARG1:.+]]: index)
+// CHECK: util.func public @zero_sizes_tensor_insert_dest(%[[ARG0:.+]]: tensor<0x?xf32>, %[[ARG1:.+]]: index)
 // CHECK:   %[[EMPTY:.+]] = tensor.empty(%[[ARG1]])
-// CHECK:   return %[[EMPTY]]
+// CHECK:   util.return %[[EMPTY]]

diff --git a/compiler/src/iree/compiler/GlobalOptimization/test/set_encoding.mlir b/compiler/src/iree/compiler/GlobalOptimization/test/set_encoding.mlir
index 5ca9aac..2d95dcd 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/test/set_encoding.mlir
+++ b/compiler/src/iree/compiler/GlobalOptimization/test/set_encoding.mlir

@@ -1,16 +1,16 @@
 // RUN: iree-opt --iree-global-opt-set-encoding --cse --split-input-file %s | FileCheck %s
 
-func.func @matmul_f32f32f32(%arg0 : tensor<100x250xf32>, %arg1 : tensor<250x500xf32>,
+util.func public @matmul_f32f32f32(%arg0 : tensor<100x250xf32>, %arg1 : tensor<250x500xf32>,
     %arg2 : tensor<100x500xf32>) -> tensor<100x500xf32> {
   %0 = linalg.matmul ins(%arg0, %arg1 : tensor<100x250xf32>, tensor<250x500xf32>)
       outs(%arg2 : tensor<100x500xf32>) -> tensor<100x500xf32>
-  return %0 : tensor<100x500xf32>
+  util.return %0 : tensor<100x500xf32>
 }
 //  CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
 //  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
 //  CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
-//      CHECK: func @matmul_f32f32f32(
+//      CHECK: util.func public @matmul_f32f32f32(
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<100x250xf32>
 // CHECK-SAME:     %[[ARG1:.+]]: tensor<250x500xf32>
 // CHECK-SAME:     %[[ARG2:.+]]: tensor<100x500xf32>
@@ -43,21 +43,21 @@
 // CHECK-SAME:       outs(%[[OUTS]] :
 //      CHECK:   %[[RESULT_PADDED:.+]] = iree_linalg_ext.unset_encoding %[[MATMUL]]
 //      CHECK:   %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0] [100, 500] [1, 1]
-//      CHECK:   return %[[RESULT]]
+//      CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @matmul_f32f32f32_dynamic(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>,
+util.func public @matmul_f32f32f32_dynamic(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>,
     %arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = linalg.matmul ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
       outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %0 : tensor<?x?xf32>
+  util.return %0 : tensor<?x?xf32>
 }
 //  CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
 //  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
 //  CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
-//      CHECK: func @matmul_f32f32f32_dynamic(
+//      CHECK: util.func public @matmul_f32f32f32_dynamic(
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<?x?xf32>, %[[ARG1:.+]]: tensor<?x?xf32>, %[[ARG2:.+]]: tensor<?x?xf32>
 //  CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
 //  CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
@@ -93,20 +93,20 @@
 // CHECK-SAME:       outs(%[[OUTS]] :
 //      CHECK:   %[[RESULT_PADDED:.+]] = iree_linalg_ext.unset_encoding %[[MATMUL]]
 //      CHECK:   %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0] [{{.*}}] [1, 1]
-//      CHECK:   return %[[RESULT]]
+//      CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @matmul_i8i8i32(%arg0 : tensor<100x250xi8>, %arg1 : tensor<250x500xi8>,
+util.func public @matmul_i8i8i32(%arg0 : tensor<100x250xi8>, %arg1 : tensor<250x500xi8>,
     %arg2 : tensor<100x500xi32>) -> tensor<100x500xi32> {
   %0 = linalg.matmul ins(%arg0, %arg1 : tensor<100x250xi8>, tensor<250x500xi8>)
       outs(%arg2 : tensor<100x500xi32>) -> tensor<100x500xi32>
-  return %0 : tensor<100x500xi32>
+  util.return %0 : tensor<100x500xi32>
 }
 //  CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
 //  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
-//      CHECK: func @matmul_i8i8i32(
+//      CHECK: util.func public @matmul_i8i8i32(
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<100x250xi8>
 // CHECK-SAME:     %[[ARG1:.+]]: tensor<250x500xi8>
 // CHECK-SAME:     %[[ARG2:.+]]: tensor<100x500xi32>
@@ -130,20 +130,20 @@
 // CHECK-SAME:       outs(%[[OUTS]] :
 //      CHECK:   %[[RESULT_PADDED:.+]] = iree_linalg_ext.unset_encoding %[[MATMUL]]
 //      CHECK:   %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0] [100, 500] [1, 1]
-//      CHECK:   return %[[RESULT]]
+//      CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @matmul_f16f16f32(%arg0 : tensor<100x250xf16>, %arg1 : tensor<250x500xf16>,
+util.func public @matmul_f16f16f32(%arg0 : tensor<100x250xf16>, %arg1 : tensor<250x500xf16>,
     %arg2 : tensor<100x500xf32>) -> tensor<100x500xf32> {
   %0 = linalg.matmul ins(%arg0, %arg1 : tensor<100x250xf16>, tensor<250x500xf16>)
       outs(%arg2 : tensor<100x500xf32>) -> tensor<100x500xf32>
-  return %0 : tensor<100x500xf32>
+  util.return %0 : tensor<100x500xf32>
 }
 //  CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
 //  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
-//      CHECK: func @matmul_f16f16f32(
+//      CHECK: util.func public @matmul_f16f16f32(
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<100x250xf16>
 // CHECK-SAME:     %[[ARG1:.+]]: tensor<250x500xf16>
 // CHECK-SAME:     %[[ARG2:.+]]: tensor<100x500xf32>
@@ -167,20 +167,20 @@
 // CHECK-SAME:       outs(%[[OUTS]] :
 //      CHECK:   %[[RESULT_PADDED:.+]] = iree_linalg_ext.unset_encoding %[[MATMUL]]
 //      CHECK:   %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0] [100, 500] [1, 1]
-//      CHECK:   return %[[RESULT]]
+//      CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @matmul_f16f16f16(%arg0 : tensor<100x250xf16>, %arg1 : tensor<250x500xf16>,
+util.func public @matmul_f16f16f16(%arg0 : tensor<100x250xf16>, %arg1 : tensor<250x500xf16>,
     %arg2 : tensor<100x500xf16>) -> tensor<100x500xf16> {
   %0 = linalg.matmul ins(%arg0, %arg1 : tensor<100x250xf16>, tensor<250x500xf16>)
       outs(%arg2 : tensor<100x500xf16>) -> tensor<100x500xf16>
-  return %0 : tensor<100x500xf16>
+  util.return %0 : tensor<100x500xf16>
 }
 //  CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
 //  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
-//      CHECK: func @matmul_f16f16f16(
+//      CHECK: util.func public @matmul_f16f16f16(
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<100x250xf16>
 // CHECK-SAME:     %[[ARG1:.+]]: tensor<250x500xf16>
 // CHECK-SAME:     %[[ARG2:.+]]: tensor<100x500xf16>
@@ -204,20 +204,20 @@
 // CHECK-SAME:       outs(%[[OUTS]] :
 //      CHECK:   %[[RESULT_PADDED:.+]] = iree_linalg_ext.unset_encoding %[[MATMUL]]
 //      CHECK:   %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0] [100, 500] [1, 1]
-//      CHECK:   return %[[RESULT]]
+//      CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @matmul_bf16bf16f32(%arg0 : tensor<100x250xbf16>, %arg1 : tensor<250x500xbf16>,
+util.func public @matmul_bf16bf16f32(%arg0 : tensor<100x250xbf16>, %arg1 : tensor<250x500xbf16>,
     %arg2 : tensor<100x500xf32>) -> tensor<100x500xf32> {
   %0 = linalg.matmul ins(%arg0, %arg1 : tensor<100x250xbf16>, tensor<250x500xbf16>)
       outs(%arg2 : tensor<100x500xf32>) -> tensor<100x500xf32>
-  return %0 : tensor<100x500xf32>
+  util.return %0 : tensor<100x500xf32>
 }
 //  CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
 //  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
-//      CHECK: func @matmul_bf16bf16f32(
+//      CHECK: util.func public @matmul_bf16bf16f32(
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<100x250xbf16>
 // CHECK-SAME:     %[[ARG1:.+]]: tensor<250x500xbf16>
 // CHECK-SAME:     %[[ARG2:.+]]: tensor<100x500xf32>
@@ -241,20 +241,20 @@
 // CHECK-SAME:       outs(%[[OUTS]] :
 //      CHECK:   %[[RESULT_PADDED:.+]] = iree_linalg_ext.unset_encoding %[[MATMUL]]
 //      CHECK:   %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0] [100, 500] [1, 1]
-//      CHECK:   return %[[RESULT]]
+//      CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @matmul_bf16bf16bf16(%arg0 : tensor<100x250xbf16>, %arg1 : tensor<250x500xbf16>,
+util.func public @matmul_bf16bf16bf16(%arg0 : tensor<100x250xbf16>, %arg1 : tensor<250x500xbf16>,
     %arg2 : tensor<100x500xbf16>) -> tensor<100x500xbf16> {
   %0 = linalg.matmul ins(%arg0, %arg1 : tensor<100x250xbf16>, tensor<250x500xbf16>)
       outs(%arg2 : tensor<100x500xbf16>) -> tensor<100x500xbf16>
-  return %0 : tensor<100x500xbf16>
+  util.return %0 : tensor<100x500xbf16>
 }
 //  CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
 //  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
-//      CHECK: func @matmul_bf16bf16bf16(
+//      CHECK: util.func public @matmul_bf16bf16bf16(
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<100x250xbf16>
 // CHECK-SAME:     %[[ARG1:.+]]: tensor<250x500xbf16>
 // CHECK-SAME:     %[[ARG2:.+]]: tensor<100x500xbf16>
@@ -278,21 +278,21 @@
 // CHECK-SAME:       outs(%[[OUTS]] :
 //      CHECK:   %[[RESULT_PADDED:.+]] = iree_linalg_ext.unset_encoding %[[MATMUL]]
 //      CHECK:   %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0] [100, 500] [1, 1]
-//      CHECK:   return %[[RESULT]]
+//      CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @batch_matmul_f32f32f32(%arg0 : tensor<64x100x250xf32>, %arg1 : tensor<64x250x500xf32>,
+util.func public @batch_matmul_f32f32f32(%arg0 : tensor<64x100x250xf32>, %arg1 : tensor<64x250x500xf32>,
     %arg2 : tensor<64x100x500xf32>) -> tensor<64x100x500xf32> {
   %0 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<64x100x250xf32>, tensor<64x250x500xf32>)
       outs(%arg2 : tensor<64x100x500xf32>) -> tensor<64x100x500xf32>
-  return %0 : tensor<64x100x500xf32>
+  util.return %0 : tensor<64x100x500xf32>
 }
 //  CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
 //  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
 //  CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-//      CHECK: func @batch_matmul_f32f32f32(
+//      CHECK: util.func public @batch_matmul_f32f32f32(
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<64x100x250xf32>
 // CHECK-SAME:     %[[ARG1:.+]]: tensor<64x250x500xf32>
 // CHECK-SAME:     %[[ARG2:.+]]: tensor<64x100x500xf32>
@@ -329,21 +329,21 @@
 // CHECK-SAME:       outs(%[[OUTS]] :
 //      CHECK:   %[[RESULT_PADDED:.+]] = iree_linalg_ext.unset_encoding %[[BATCH_MATMUL]]
 //      CHECK:   %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0, 0] [64, 100, 500] [1, 1, 1]
-//      CHECK:   return %[[RESULT]]
+//      CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @batch_matmul_f32f32f32_dynamic(%arg0 : tensor<?x?x?xf32>, %arg1 : tensor<?x?x?xf32>,
+util.func public @batch_matmul_f32f32f32_dynamic(%arg0 : tensor<?x?x?xf32>, %arg1 : tensor<?x?x?xf32>,
     %arg2 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32> {
   %0 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<?x?x?xf32>, tensor<?x?x?xf32>)
       outs(%arg2 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
-  return %0 : tensor<?x?x?xf32>
+  util.return %0 : tensor<?x?x?xf32>
 }
 //  CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
 //  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
 //  CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-//      CHECK: func @batch_matmul_f32f32f32_dynamic(
+//      CHECK: util.func public @batch_matmul_f32f32f32_dynamic(
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<?x?x?xf32>, %[[ARG1:.+]]: tensor<?x?x?xf32>, %[[ARG2:.+]]: tensor<?x?x?xf32>
 //  CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
 //  CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
@@ -386,21 +386,21 @@
 // CHECK-SAME:       outs(%[[OUTS]] :
 //      CHECK:   %[[RESULT_PADDED:.+]] = iree_linalg_ext.unset_encoding %[[BATCH_MATMUL]]
 //      CHECK:   %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0, 0] [{{.*}}] [1, 1, 1]
-//      CHECK:   return %[[RESULT]]
+//      CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @batch_matmul_f16f16f16(%arg0 : tensor<64x100x250xf16>, %arg1 : tensor<64x250x500xf16>,
+util.func public @batch_matmul_f16f16f16(%arg0 : tensor<64x100x250xf16>, %arg1 : tensor<64x250x500xf16>,
     %arg2 : tensor<64x100x500xf16>) -> tensor<64x100x500xf16> {
   %0 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<64x100x250xf16>, tensor<64x250x500xf16>)
       outs(%arg2 : tensor<64x100x500xf16>) -> tensor<64x100x500xf16>
-  return %0 : tensor<64x100x500xf16>
+  util.return %0 : tensor<64x100x500xf16>
 }
 //  CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
 //  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
 //  CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-//      CHECK: func @batch_matmul_f16f16f16(
+//      CHECK: util.func public @batch_matmul_f16f16f16(
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<64x100x250xf16>
 // CHECK-SAME:     %[[ARG1:.+]]: tensor<64x250x500xf16>
 // CHECK-SAME:     %[[ARG2:.+]]: tensor<64x100x500xf16>
@@ -437,21 +437,21 @@
 // CHECK-SAME:       outs(%[[OUTS]] :
 //      CHECK:   %[[RESULT_PADDED:.+]] = iree_linalg_ext.unset_encoding %[[BATCH_MATMUL]]
 //      CHECK:   %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0, 0] [64, 100, 500] [1, 1, 1]
-//      CHECK:   return %[[RESULT]]
+//      CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @batch_matmul_f16f16f32(%arg0 : tensor<64x100x250xf16>, %arg1 : tensor<64x250x500xf16>,
+util.func public @batch_matmul_f16f16f32(%arg0 : tensor<64x100x250xf16>, %arg1 : tensor<64x250x500xf16>,
     %arg2 : tensor<64x100x500xf32>) -> tensor<64x100x500xf32> {
   %0 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<64x100x250xf16>, tensor<64x250x500xf16>)
       outs(%arg2 : tensor<64x100x500xf32>) -> tensor<64x100x500xf32>
-  return %0 : tensor<64x100x500xf32>
+  util.return %0 : tensor<64x100x500xf32>
 }
 //  CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
 //  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
 //  CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-//      CHECK: func @batch_matmul_f16f16f32(
+//      CHECK: util.func public @batch_matmul_f16f16f32(
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<64x100x250xf16>
 // CHECK-SAME:     %[[ARG1:.+]]: tensor<64x250x500xf16>
 // CHECK-SAME:     %[[ARG2:.+]]: tensor<64x100x500xf32>
@@ -488,21 +488,21 @@
 // CHECK-SAME:       outs(%[[OUTS]] :
 //      CHECK:   %[[RESULT_PADDED:.+]] = iree_linalg_ext.unset_encoding %[[BATCH_MATMUL]]
 //      CHECK:   %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0, 0] [64, 100, 500] [1, 1, 1]
-//      CHECK:   return %[[RESULT]]
+//      CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @batch_matmul_bf16bf16bf16(%arg0 : tensor<64x100x250xbf16>, %arg1 : tensor<64x250x500xbf16>,
+util.func public @batch_matmul_bf16bf16bf16(%arg0 : tensor<64x100x250xbf16>, %arg1 : tensor<64x250x500xbf16>,
     %arg2 : tensor<64x100x500xbf16>) -> tensor<64x100x500xbf16> {
   %0 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<64x100x250xbf16>, tensor<64x250x500xbf16>)
       outs(%arg2 : tensor<64x100x500xbf16>) -> tensor<64x100x500xbf16>
-  return %0 : tensor<64x100x500xbf16>
+  util.return %0 : tensor<64x100x500xbf16>
 }
 //  CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
 //  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
 //  CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-//      CHECK: func @batch_matmul_bf16bf16bf16(
+//      CHECK: util.func public @batch_matmul_bf16bf16bf16(
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<64x100x250xbf16>
 // CHECK-SAME:     %[[ARG1:.+]]: tensor<64x250x500xbf16>
 // CHECK-SAME:     %[[ARG2:.+]]: tensor<64x100x500xbf16>
@@ -539,21 +539,21 @@
 // CHECK-SAME:       outs(%[[OUTS]] :
 //      CHECK:   %[[RESULT_PADDED:.+]] = iree_linalg_ext.unset_encoding %[[BATCH_MATMUL]]
 //      CHECK:   %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0, 0] [64, 100, 500] [1, 1, 1]
-//      CHECK:   return %[[RESULT]]
+//      CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @batch_matmul_bf16bf16f32(%arg0 : tensor<64x100x250xbf16>, %arg1 : tensor<64x250x500xbf16>,
+util.func public @batch_matmul_bf16bf16f32(%arg0 : tensor<64x100x250xbf16>, %arg1 : tensor<64x250x500xbf16>,
     %arg2 : tensor<64x100x500xf32>) -> tensor<64x100x500xf32> {
   %0 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<64x100x250xbf16>, tensor<64x250x500xbf16>)
       outs(%arg2 : tensor<64x100x500xf32>) -> tensor<64x100x500xf32>
-  return %0 : tensor<64x100x500xf32>
+  util.return %0 : tensor<64x100x500xf32>
 }
 //  CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
 //  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
 //  CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-//      CHECK: func @batch_matmul_bf16bf16f32(
+//      CHECK: util.func public @batch_matmul_bf16bf16f32(
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<64x100x250xbf16>
 // CHECK-SAME:     %[[ARG1:.+]]: tensor<64x250x500xbf16>
 // CHECK-SAME:     %[[ARG2:.+]]: tensor<64x100x500xf32>
@@ -590,21 +590,21 @@
 // CHECK-SAME:       outs(%[[OUTS]] :
 //      CHECK:   %[[RESULT_PADDED:.+]] = iree_linalg_ext.unset_encoding %[[BATCH_MATMUL]]
 //      CHECK:   %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0, 0] [64, 100, 500] [1, 1, 1]
-//      CHECK:   return %[[RESULT]]
+//      CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @batch_matmul_i8i8i32(%arg0 : tensor<64x100x250xi8>, %arg1 : tensor<64x250x500xi8>,
+util.func public @batch_matmul_i8i8i32(%arg0 : tensor<64x100x250xi8>, %arg1 : tensor<64x250x500xi8>,
     %arg2 : tensor<64x100x500xi32>) -> tensor<64x100x500xi32> {
   %0 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<64x100x250xi8>, tensor<64x250x500xi8>)
       outs(%arg2 : tensor<64x100x500xi32>) -> tensor<64x100x500xi32>
-  return %0 : tensor<64x100x500xi32>
+  util.return %0 : tensor<64x100x500xi32>
 }
 //  CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
 //  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
 //  CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-//      CHECK: func @batch_matmul_i8i8i32(
+//      CHECK: util.func public @batch_matmul_i8i8i32(
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<64x100x250xi8>
 // CHECK-SAME:     %[[ARG1:.+]]: tensor<64x250x500xi8>
 // CHECK-SAME:     %[[ARG2:.+]]: tensor<64x100x500xi32>
@@ -641,21 +641,21 @@
 // CHECK-SAME:       outs(%[[OUTS]] :
 //      CHECK:   %[[RESULT_PADDED:.+]] = iree_linalg_ext.unset_encoding %[[BATCH_MATMUL]]
 //      CHECK:   %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0, 0] [64, 100, 500] [1, 1, 1]
-//      CHECK:   return %[[RESULT]]
+//      CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @vecmat_f32f32f32(%arg0 : tensor<250xf32>, %arg1 : tensor<250x500xf32>,
+util.func public @vecmat_f32f32f32(%arg0 : tensor<250xf32>, %arg1 : tensor<250x500xf32>,
     %arg2 : tensor<500xf32>) -> tensor<500xf32> {
   %0 = linalg.vecmat ins(%arg0, %arg1 : tensor<250xf32>, tensor<250x500xf32>)
       outs(%arg2 : tensor<500xf32>) -> tensor<500xf32>
-  return %0 : tensor<500xf32>
+  util.return %0 : tensor<500xf32>
 }
 //  CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1) -> (d1)>
 //  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1) -> (d1, d0)>
 //  CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1) -> (d0)>
-//      CHECK: func @vecmat_f32f32f32(
+//      CHECK: util.func public @vecmat_f32f32f32(
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<250xf32>
 // CHECK-SAME:     %[[ARG1:.+]]: tensor<250x500xf32>
 // CHECK-SAME:     %[[ARG2:.+]]: tensor<500xf32>
@@ -685,21 +685,21 @@
 // CHECK-SAME:       outs(%[[OUTS]] :
 //      CHECK:   %[[RESULT_PADDED:.+]] = iree_linalg_ext.unset_encoding %[[VECMAT]]
 //      CHECK:   %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0] [500] [1]
-//      CHECK:   return %[[RESULT]]
+//      CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @matvec_f32f32f32(%arg0 : tensor<100x250xf32>, %arg1 : tensor<250xf32>,
+util.func public @matvec_f32f32f32(%arg0 : tensor<100x250xf32>, %arg1 : tensor<250xf32>,
     %arg2 : tensor<100xf32>) -> tensor<100xf32> {
   %0 = linalg.matvec ins(%arg0, %arg1 : tensor<100x250xf32>, tensor<250xf32>)
       outs(%arg2 : tensor<100xf32>) -> tensor<100xf32>
-  return %0 : tensor<100xf32>
+  util.return %0 : tensor<100xf32>
 }
 //  CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1) -> (d0, d1)>
 //  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1) -> (d1)>
 //  CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1) -> (d0)>
-//      CHECK: func @matvec_f32f32f32(
+//      CHECK: util.func public @matvec_f32f32f32(
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<100x250xf32>
 // CHECK-SAME:     %[[ARG1:.+]]: tensor<250xf32>
 // CHECK-SAME:     %[[ARG2:.+]]: tensor<100xf32>
@@ -729,21 +729,21 @@
 // CHECK-SAME:       outs(%[[OUTS]] :
 //      CHECK:   %[[RESULT_PADDED:.+]] = iree_linalg_ext.unset_encoding %[[MATVEC]]
 //      CHECK:   %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0] [100] [1]
-//      CHECK:   return %[[RESULT]]
+//      CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @batch_vecmat_f32f32f32(%arg0 : tensor<3x250xf32>, %arg1 : tensor<3x250x500xf32>,
+util.func public @batch_vecmat_f32f32f32(%arg0 : tensor<3x250xf32>, %arg1 : tensor<3x250x500xf32>,
     %arg2 : tensor<3x500xf32>) -> tensor<3x500xf32> {
   %0 = linalg.batch_vecmat ins(%arg0, %arg1 : tensor<3x250xf32>, tensor<3x250x500xf32>)
       outs(%arg2 : tensor<3x500xf32>) -> tensor<3x500xf32>
-  return %0 : tensor<3x500xf32>
+  util.return %0 : tensor<3x500xf32>
 }
 //  CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
 //  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d2, d1)>
 //  CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
-//      CHECK: func @batch_vecmat_f32f32f32(
+//      CHECK: util.func public @batch_vecmat_f32f32f32(
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<3x250xf32>
 // CHECK-SAME:     %[[ARG1:.+]]: tensor<3x250x500xf32>
 // CHECK-SAME:     %[[ARG2:.+]]: tensor<3x500xf32>
@@ -777,21 +777,21 @@
 // CHECK-SAME:       outs(%[[OUTS]] :
 //      CHECK:   %[[RESULT_PADDED:.+]] = iree_linalg_ext.unset_encoding %[[VECMAT]]
 //      CHECK:   %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0] [3, 500] [1, 1]
-//      CHECK:   return %[[RESULT]]
+//      CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @batch_matvec_f32f32f32_dynamic(%arg0 : tensor<?x?x?xf32>, %arg1 : tensor<?x?xf32>,
+util.func public @batch_matvec_f32f32f32_dynamic(%arg0 : tensor<?x?x?xf32>, %arg1 : tensor<?x?xf32>,
     %arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> {
   %0 = linalg.batch_matvec ins(%arg0, %arg1 : tensor<?x?x?xf32>, tensor<?x?xf32>)
       outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %0 : tensor<?x?xf32>
+  util.return %0 : tensor<?x?xf32>
 }
 //  CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 //  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
 //  CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
-//      CHECK: func @batch_matvec_f32f32f32_dynamic(
+//      CHECK: util.func public @batch_matvec_f32f32f32_dynamic(
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<?x?x?xf32>, %[[ARG1:.+]]: tensor<?x?xf32>, %[[ARG2:.+]]: tensor<?x?xf32>
 //  CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
 //  CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
@@ -830,28 +830,28 @@
 // CHECK-SAME:       outs(%[[OUTS]] :
 //      CHECK:   %[[RESULT_PADDED:.+]] = iree_linalg_ext.unset_encoding %[[BATCH_MATVEC]]
 //      CHECK:   %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0] [{{.*}}] [1, 1]
-//      CHECK:   return %[[RESULT]]
+//      CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @fold_fill_with_set_encoding(%arg0 : index, %arg1 : index)
+util.func public @fold_fill_with_set_encoding(%arg0 : index, %arg1 : index)
   -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>> {
   %cst = arith.constant 0.0 : f32
   %0 = tensor.empty(%arg0, %arg1) : tensor<?x?xf32>
   %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<?x?xf32>) -> tensor<?x?xf32>
   %2 = iree_linalg_ext.set_encoding %1 : tensor<?x?xf32>
       -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>
-  return %2 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>
+  util.return %2 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>
 }
-//      CHECK: func @fold_fill_with_set_encoding(
+//      CHECK: util.func public @fold_fill_with_set_encoding(
 //      CHECK:   %[[EMPTY:.+]] = tensor.empty(%{{.+}}, %{{.+}}) : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>
 //      CHECK:   %[[FILL:.+]] = linalg.fill
 // CHECK-SAME:       outs(%[[EMPTY]] : tensor<?x?xf32, #iree_linalg_ext.encoding<role = LHS, element_types = [f32, f32, f32]>>)
-//      CHECK:   return %[[FILL]]
+//      CHECK:   util.return %[[FILL]]
 
 // -----
 
-func.func @fold_fill_with_tensor_pad(%arg0 : index, %arg1 : index, %arg2 : index, %arg3 : index)
+util.func public @fold_fill_with_tensor_pad(%arg0 : index, %arg1 : index, %arg2 : index, %arg3 : index)
     -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32]>> {
   %cst = arith.constant 0.0 : f32
   %0 = tensor.empty(%arg0, %arg1) : tensor<?x?xf32>
@@ -862,14 +862,14 @@
   } : tensor<?x?xf32> to tensor<?x?xf32>
   %3 = iree_linalg_ext.set_encoding %2 : tensor<?x?xf32>
       -> tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32]>>
-  return %3 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32]>>
+  util.return %3 : tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32]>>
 }
-//      CHECK: func @fold_fill_with_tensor_pad(
+//      CHECK: util.func public @fold_fill_with_tensor_pad(
 //      CHECK:   %[[EMPTY:.+]] = tensor.empty(
 // CHECK-SAME:       tensor<?x?xf32, #iree_linalg_ext.encoding<role = RESULT, element_types = [f32, f32, f32]>>
 //      CHECK:   %[[FILL:.+]] = linalg.fill
 // CHECK-SAME:       outs(%[[EMPTY]] :
-//      CHECK:   return %[[FILL]]
+//      CHECK:   util.return %[[FILL]]
 
 // -----
 
@@ -882,7 +882,7 @@
     translation_info  = <CPUDefault>>
 
 
-func.func @preset_compilation_info(
+util.func public @preset_compilation_info(
     %arg0 : tensor<?x?xf32>,
     %arg1 : tensor<?x?xf32>,
     %arg2 : tensor<?x?xf32>,
@@ -893,9 +893,9 @@
       outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
   %1 = linalg.batch_matmul {compilation_info = #compilation1} ins(%arg3, %arg4 : tensor<?x?x?xf32>, tensor<?x?x?xf32>)
       outs(%arg5 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
-  return %0, %1 : tensor<?x?xf32>, tensor<?x?x?xf32>
+  util.return %0, %1 : tensor<?x?xf32>, tensor<?x?x?xf32>
 }
-// CHECK-LABEL: func.func @preset_compilation_info
+// CHECK-LABEL: util.func public @preset_compilation_info
 // CHECK-NOT:     set_encoding
 // CHECK-NOT:     unset_encoding
 // CHECK:         linalg.matmul
@@ -903,7 +903,7 @@
 
 // -----
 
-func.func @batch_matmul_truncf_f16f16f32(%arg0 : tensor<64x100x250xf32>, %arg1 : tensor<64x250x500xf32>,
+util.func public @batch_matmul_truncf_f16f16f32(%arg0 : tensor<64x100x250xf32>, %arg1 : tensor<64x250x500xf32>,
       %arg2 : tensor<64x100x500xf32>) -> tensor<64x100x500xf32> {
   %0 = tensor.empty() : tensor<64x250x500xf16>
   %casted0 = arith.truncf %arg0 : tensor<64x100x250xf32> to tensor<64x100x250xf16>
@@ -918,10 +918,10 @@
   } -> tensor<64x250x500xf16>
   %1 = linalg.batch_matmul ins(%casted0, %casted1 : tensor<64x100x250xf16>, tensor<64x250x500xf16>)
       outs(%arg2 : tensor<64x100x500xf32>) -> tensor<64x100x500xf32>
-  return %1 : tensor<64x100x500xf32>
+  util.return %1 : tensor<64x100x500xf32>
 }
 
-//      CHECK: func @batch_matmul_truncf_f16f16f32(%[[ARG0:.+]]: tensor<64x100x250xf32>, %[[ARG1:.+]]: tensor<64x250x500xf32>
+//      CHECK: util.func public @batch_matmul_truncf_f16f16f32(%[[ARG0:.+]]: tensor<64x100x250xf32>, %[[ARG1:.+]]: tensor<64x250x500xf32>
 //  CHECK-DAG: %[[INIT:.+]] = tensor.empty() : tensor<64x250x500xf16>
 //  CHECK-DAG: arith.truncf %[[ARG0]] : tensor<64x100x250xf32> to tensor<64x100x250xf16>
 //      CHECK: linalg.generic
@@ -931,26 +931,26 @@
 
 // -----
 
-func.func @matmul_casted_from_i1_f32f32f32(%arg0 : tensor<64x256xi1>,
+util.func public @matmul_casted_from_i1_f32f32f32(%arg0 : tensor<64x256xi1>,
     %arg1 : tensor<256x128xf32>) -> tensor<64x128xf32> {
   %cst = arith.constant 0.000000e+00 : f32
   %casted = arith.uitofp %arg0 : tensor<64x256xi1> to tensor<64x256xf32>
   %0 = tensor.empty() : tensor<64x128xf32>
   %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<64x128xf32>) -> tensor<64x128xf32>
   %2 = linalg.matmul ins(%casted, %arg1 : tensor<64x256xf32>, tensor<256x128xf32>) outs(%1 : tensor<64x128xf32>) -> tensor<64x128xf32>
-  return %2 : tensor<64x128xf32>
+  util.return %2 : tensor<64x128xf32>
 }
 //  CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
 //  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
-//      CHECK: func.func @matmul_casted_from_i1_f32f32f32
+//      CHECK: util.func public @matmul_casted_from_i1_f32f32f32
 // CHECK:         set_encoding {{.+}} tensor<?x?xf32, #iree_linalg_ext.encoding<role =  LHS, element_types = [f32, f32, f32], original_type = tensor<64x256xf32>, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>>
 // CHECK:         set_encoding {{.+}} tensor<?x?xf32, #iree_linalg_ext.encoding<role =  RHS, element_types = [f32, f32, f32], original_type = tensor<256x128xf32>, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>>
 // CHECK:         set_encoding {{.+}} tensor<?x?xf32, #iree_linalg_ext.encoding<role =  RESULT, element_types = [f32, f32, f32], original_type = tensor<64x128xf32>, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>>
 
 // -----
 
-func.func @matmul_generic_casted_from_i1_f32f32f32(%arg0 : tensor<64x256xi1>,
+util.func public @matmul_generic_casted_from_i1_f32f32f32(%arg0 : tensor<64x256xi1>,
     %arg1 : tensor<256x128xf32>) -> tensor<64x128xf32> {
   %cst = arith.constant 0.000000e+00 : f32
   %init = tensor.empty() : tensor<64x256xf32>
@@ -966,28 +966,28 @@
   %0 = tensor.empty() : tensor<64x128xf32>
   %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<64x128xf32>) -> tensor<64x128xf32>
   %2 = linalg.matmul ins(%casted, %arg1 : tensor<64x256xf32>, tensor<256x128xf32>) outs(%1 : tensor<64x128xf32>) -> tensor<64x128xf32>
-  return %2 : tensor<64x128xf32>
+  util.return %2 : tensor<64x128xf32>
 }
 //  CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
 //  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
-//      CHECK: func.func @matmul_generic_casted_from_i1_f32f32f32
+//      CHECK: util.func public @matmul_generic_casted_from_i1_f32f32f32
 // CHECK:         set_encoding {{.+}} tensor<?x?xf32, #iree_linalg_ext.encoding<role =  LHS, element_types = [f32, f32, f32], original_type = tensor<64x256xf32>, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>>
 // CHECK:         set_encoding {{.+}} tensor<?x?xf32, #iree_linalg_ext.encoding<role =  RHS, element_types = [f32, f32, f32], original_type = tensor<256x128xf32>, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>>
 // CHECK:         set_encoding {{.+}} tensor<?x?xf32, #iree_linalg_ext.encoding<role =  RESULT, element_types = [f32, f32, f32], original_type = tensor<64x128xf32>, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>>
 
 // -----
 
-func.func @matmul_f32f32f32_narrow_M(%arg0 : tensor<2x250xf32>, %arg1 : tensor<250x500xf32>,
+util.func public @matmul_f32f32f32_narrow_M(%arg0 : tensor<2x250xf32>, %arg1 : tensor<250x500xf32>,
     %arg2 : tensor<2x500xf32>) -> tensor<2x500xf32> {
   %0 = linalg.matmul ins(%arg0, %arg1 : tensor<2x250xf32>, tensor<250x500xf32>)
       outs(%arg2 : tensor<2x500xf32>) -> tensor<2x500xf32>
-  return %0 : tensor<2x500xf32>
+  util.return %0 : tensor<2x500xf32>
 }
 //  CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
 //  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
-//      CHECK: func @matmul_f32f32f32_narrow_M(
+//      CHECK: util.func public @matmul_f32f32f32_narrow_M(
 //      CHECK:  iree_linalg_ext.upper_bound_tile_size tensor<2x250xf32, #iree_linalg_ext.encoding<role =  LHS, element_types = [f32, f32, f32], matmul_narrow_M = 2 : index, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>>
 //      CHECK:  iree_linalg_ext.upper_bound_tile_size tensor<250x500xf32, #iree_linalg_ext.encoding<role =  RHS, element_types = [f32, f32, f32], matmul_narrow_M = 2 : index, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>>
 //      CHECK:  iree_linalg_ext.upper_bound_tile_size tensor<2x500xf32, #iree_linalg_ext.encoding<role =  RESULT, element_types = [f32, f32, f32], matmul_narrow_M = 2 : index, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>>
@@ -995,16 +995,16 @@
 
 // -----
 
-func.func @batch_matmul_f32f32f32_narrow_MN(%arg0 : tensor<64x4x250xf32>, %arg1 : tensor<64x250x2xf32>,
+util.func public @batch_matmul_f32f32f32_narrow_MN(%arg0 : tensor<64x4x250xf32>, %arg1 : tensor<64x250x2xf32>,
     %arg2 : tensor<64x4x2xf32>) -> tensor<64x4x2xf32> {
   %0 = linalg.batch_matmul ins(%arg0, %arg1 : tensor<64x4x250xf32>, tensor<64x250x2xf32>)
       outs(%arg2 : tensor<64x4x2xf32>) -> tensor<64x4x2xf32>
-  return %0 : tensor<64x4x2xf32>
+  util.return %0 : tensor<64x4x2xf32>
 }
 //  CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
 //  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-//      CHECK: func @batch_matmul_f32f32f32_narrow_MN(
+//      CHECK: util.func public @batch_matmul_f32f32f32_narrow_MN(
 //      CHECK:   iree_linalg_ext.upper_bound_tile_size tensor<64x4x250xf32, #iree_linalg_ext.encoding<role =  LHS, element_types = [f32, f32, f32], matmul_narrow_M = 4 : index, matmul_narrow_N = 2 : index, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>>
 //      CHECK:   iree_linalg_ext.upper_bound_tile_size tensor<64x250x2xf32, #iree_linalg_ext.encoding<role =  RHS, element_types = [f32, f32, f32], matmul_narrow_M = 4 : index, matmul_narrow_N = 2 : index, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>>
 //      CHECK:   iree_linalg_ext.upper_bound_tile_size tensor<64x4x2xf32, #iree_linalg_ext.encoding<role =  RESULT, element_types = [f32, f32, f32], matmul_narrow_M = 4 : index, matmul_narrow_N = 2 : index, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>>
@@ -1012,18 +1012,18 @@
 
 // -----
 
-func.func @matmul_transpose_a_f32f32f32(%arg0 : tensor<250x100xf32>, %arg1 : tensor<250x500xf32>,
+util.func public @matmul_transpose_a_f32f32f32(%arg0 : tensor<250x100xf32>, %arg1 : tensor<250x500xf32>,
     %arg2 : tensor<100x500xf32>) -> tensor<100x500xf32> {
   %0 = linalg.matmul_transpose_a ins(%arg0, %arg1 : tensor<250x100xf32>, tensor<250x500xf32>)
       outs(%arg2 : tensor<100x500xf32>) -> tensor<100x500xf32>
-  return %0 : tensor<100x500xf32>
+  util.return %0 : tensor<100x500xf32>
 }
 
 //  CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)>
 //  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
 //  CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
-//      CHECK: func @matmul_transpose_a_f32f32f32(
+//      CHECK: util.func public @matmul_transpose_a_f32f32f32(
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<250x100xf32>
 // CHECK-SAME:     %[[ARG1:.+]]: tensor<250x500xf32>
 // CHECK-SAME:     %[[ARG2:.+]]: tensor<100x500xf32>
@@ -1056,21 +1056,21 @@
 // CHECK-SAME:       outs(%[[OUTS]] :
 //      CHECK:   %[[RESULT_PADDED:.+]] = iree_linalg_ext.unset_encoding %[[MATMUL]]
 //      CHECK:   %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0] [100, 500] [1, 1]
-//      CHECK:   return %[[RESULT]]
+//      CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @matmul_transpose_b_f32f32f32(%arg0 : tensor<100x250xf32>, %arg1 : tensor<500x250xf32>,
+util.func public @matmul_transpose_b_f32f32f32(%arg0 : tensor<100x250xf32>, %arg1 : tensor<500x250xf32>,
     %arg2 : tensor<100x500xf32>) -> tensor<100x500xf32> {
   %0 = linalg.matmul_transpose_b ins(%arg0, %arg1 : tensor<100x250xf32>, tensor<500x250xf32>)
       outs(%arg2 : tensor<100x500xf32>) -> tensor<100x500xf32>
-  return %0 : tensor<100x500xf32>
+  util.return %0 : tensor<100x500xf32>
 }
 //  CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
 //  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>
 //  CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
-//      CHECK: func @matmul_transpose_b_f32f32f32(
+//      CHECK: util.func public @matmul_transpose_b_f32f32f32(
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<100x250xf32>
 // CHECK-SAME:     %[[ARG1:.+]]: tensor<500x250xf32>
 // CHECK-SAME:     %[[ARG2:.+]]: tensor<100x500xf32>
@@ -1103,21 +1103,21 @@
 // CHECK-SAME:       outs(%[[OUTS]] :
 //      CHECK:   %[[RESULT_PADDED:.+]] = iree_linalg_ext.unset_encoding %[[MATMUL]]
 //      CHECK:   %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0] [100, 500] [1, 1]
-//      CHECK:   return %[[RESULT]]
+//      CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @batch_matmul_transpose_a_f32f32f32(%arg0 : tensor<2x250x100xf32>, %arg1 : tensor<2x250x500xf32>,
+util.func public @batch_matmul_transpose_a_f32f32f32(%arg0 : tensor<2x250x100xf32>, %arg1 : tensor<2x250x500xf32>,
     %arg2 : tensor<2x100x500xf32>) -> tensor<2x100x500xf32> {
   %0 = linalg.batch_matmul_transpose_a ins(%arg0, %arg1 : tensor<2x250x100xf32>, tensor<2x250x500xf32>)
       outs(%arg2 : tensor<2x100x500xf32>) -> tensor<2x100x500xf32>
-  return %0 : tensor<2x100x500xf32>
+  util.return %0 : tensor<2x100x500xf32>
 }
 //  CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1)>
 //  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
 //  CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-//      CHECK: func @batch_matmul_transpose_a_f32f32f32(
+//      CHECK: util.func public @batch_matmul_transpose_a_f32f32f32(
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<2x250x100xf32>
 // CHECK-SAME:     %[[ARG1:.+]]: tensor<2x250x500xf32>
 // CHECK-SAME:     %[[ARG2:.+]]: tensor<2x100x500xf32>
@@ -1154,21 +1154,21 @@
 // CHECK-SAME:       outs(%[[OUTS]] :
 //      CHECK:   %[[RESULT_PADDED:.+]] = iree_linalg_ext.unset_encoding %[[BATCH_MATMUL]]
 //      CHECK:   %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0, 0] [2, 100, 500] [1, 1, 1]
-//      CHECK:   return %[[RESULT]]
+//      CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @batch_matmul_transpose_b_f32f32f32(%arg0 : tensor<2x100x250xf32>, %arg1 : tensor<2x500x250xf32>,
+util.func public @batch_matmul_transpose_b_f32f32f32(%arg0 : tensor<2x100x250xf32>, %arg1 : tensor<2x500x250xf32>,
     %arg2 : tensor<2x100x500xf32>) -> tensor<2x100x500xf32> {
   %0 = linalg.batch_matmul_transpose_b ins(%arg0, %arg1 : tensor<2x100x250xf32>, tensor<2x500x250xf32>)
       outs(%arg2 : tensor<2x100x500xf32>) -> tensor<2x100x500xf32>
-  return %0 : tensor<2x100x500xf32>
+  util.return %0 : tensor<2x100x500xf32>
 }
 //  CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
 //  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
 //  CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-//      CHECK: func @batch_matmul_transpose_b_f32f32f32(
+//      CHECK: util.func public @batch_matmul_transpose_b_f32f32f32(
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<2x100x250xf32>
 // CHECK-SAME:     %[[ARG1:.+]]: tensor<2x500x250xf32>
 // CHECK-SAME:     %[[ARG2:.+]]: tensor<2x100x500xf32>
@@ -1205,11 +1205,11 @@
 // CHECK-SAME:       outs(%[[OUTS]] :
 //      CHECK:   %[[RESULT_PADDED:.+]] = iree_linalg_ext.unset_encoding %[[BATCH_MATMUL]]
 //      CHECK:   %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0, 0] [2, 100, 500] [1, 1, 1]
-//      CHECK:   return %[[RESULT]]
+//      CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @generic_batch_vecmat_transposed_i16u4i32(%arg0 : tensor<32x128xi16>, %arg1 : tensor<4096x32x128xi4>,
+util.func public @generic_batch_vecmat_transposed_i16u4i32(%arg0 : tensor<32x128xi16>, %arg1 : tensor<4096x32x128xi4>,
     %arg2 : tensor<4096x32xi32>) -> tensor<4096x32xi32> {
   %0 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<32x128xi16>, tensor<4096x32x128xi4>) outs(%arg2 : tensor<4096x32xi32>) {
   ^bb0(%in: i16, %in_5: i4, %out: i32):
@@ -1219,14 +1219,14 @@
     %25 = arith.addi %24, %out : i32
     linalg.yield %25 : i32
   } -> tensor<4096x32xi32>
-  return %0 : tensor<4096x32xi32>
+  util.return %0 : tensor<4096x32xi32>
 }
 
 //  CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>
 //  CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>
 //  CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 //  CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
-//      CHECK: func @generic_batch_vecmat_transposed_i16u4i32(
+//      CHECK: util.func public @generic_batch_vecmat_transposed_i16u4i32(
 // CHECK-SAME:     %[[ARG0:.+]]: tensor<32x128xi16>
 // CHECK-SAME:     %[[ARG1:.+]]: tensor<4096x32x128xi4>
 // CHECK-SAME:     %[[ARG2:.+]]: tensor<4096x32xi32>
@@ -1262,21 +1262,21 @@
 // CHECK-SAME:       outs(%[[OUTS]] :
 //      CHECK:   %[[RESULT_PADDED:.+]] = iree_linalg_ext.unset_encoding %[[GENERIC]]
 //      CHECK:   %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0] [4096, 32] [1, 1]
-//      CHECK:   return %[[RESULT]]
+//      CHECK:   util.return %[[RESULT]]
 
 // -----
 
-func.func @dot(%arg0: tensor<1024xf32>, %arg1: tensor<1024xf32>) -> tensor<f32> {
+util.func public @dot(%arg0: tensor<1024xf32>, %arg1: tensor<1024xf32>) -> tensor<f32> {
   %res = "stablehlo.dot"(%arg0, %arg1) : (tensor<1024xf32>, tensor<1024xf32>) -> tensor<f32>
-  return %res : tensor<f32>
+  util.return %res : tensor<f32>
 }
 
-// CHECK: func @dot(
+// CHECK: util.func public @dot(
 // CHECK: stablehlo.dot %{{.*}}, %{{.*}} : (tensor<1024xf32>, tensor<1024xf32>) -> tensor<f32>
 
 // -----
 
-func.func @multi_m_dim_generic(%arg0 : tensor<64x4x128xf32>, %arg1 : tensor<128x512xf32>,
+util.func public @multi_m_dim_generic(%arg0 : tensor<64x4x128xf32>, %arg1 : tensor<128x512xf32>,
     %arg2 : tensor<64x4x512xf32>) -> tensor<64x4x512xf32> {
     %4 = linalg.generic {
         indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>,
@@ -1289,17 +1289,17 @@
       %6 = arith.addf %5, %out : f32
       linalg.yield %6 : f32
     } -> tensor<64x4x512xf32>
-  return %4 : tensor<64x4x512xf32>
+  util.return %4 : tensor<64x4x512xf32>
 }
 
-//      CHECK: func @multi_m_dim_generic(
+//      CHECK: util.func public @multi_m_dim_generic(
 //      CHECK:   linalg.generic
 // CHECK-SAME:      ins(%{{.*}}, %{{.*}} : tensor<64x4x128xf32>, tensor<128x512xf32>)
 // CHECK-SAME:      outs(%{{.*}} : tensor<64x4x512xf32>)
 
 // -----
 
-func.func @multi_n_dim_generic(%arg0 : tensor<256x128xf32>, %arg1 : tensor<128x64x8xf32>,
+util.func public @multi_n_dim_generic(%arg0 : tensor<256x128xf32>, %arg1 : tensor<128x64x8xf32>,
     %arg2 : tensor<256x64x8xf32>) -> tensor<256x64x8xf32> {
     %4 = linalg.generic {
         indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2)>,
@@ -1312,17 +1312,17 @@
       %6 = arith.addf %5, %out : f32
       linalg.yield %6 : f32
     } -> tensor<256x64x8xf32>
-  return %4 : tensor<256x64x8xf32>
+  util.return %4 : tensor<256x64x8xf32>
 }
 
-//      CHECK: func @multi_n_dim_generic(
+//      CHECK: util.func public @multi_n_dim_generic(
 //      CHECK:   linalg.generic
 // CHECK-SAME:      ins(%{{.*}}, %{{.*}} : tensor<256x128xf32>, tensor<128x64x8xf32>)
 // CHECK-SAME:      outs(%{{.*}} : tensor<256x64x8xf32>)
 
 // -----
 
-func.func @multi_k_dim_generic(%arg0 : tensor<256x64x2xf32>, %arg1 : tensor<64x2x512xf32>,
+util.func public @multi_k_dim_generic(%arg0 : tensor<256x64x2xf32>, %arg1 : tensor<64x2x512xf32>,
     %arg2 : tensor<256x512xf32>) -> tensor<256x512xf32> {
     %4 = linalg.generic {
         indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>,
@@ -1335,17 +1335,17 @@
       %6 = arith.addf %5, %out : f32
       linalg.yield %6 : f32
     } -> tensor<256x512xf32>
-  return %4 : tensor<256x512xf32>
+  util.return %4 : tensor<256x512xf32>
 }
 
-//      CHECK: func @multi_k_dim_generic(
+//      CHECK: util.func public @multi_k_dim_generic(
 //      CHECK:   linalg.generic
 // CHECK-SAME:      ins(%{{.*}}, %{{.*}} : tensor<256x64x2xf32>, tensor<64x2x512xf32>)
 // CHECK-SAME:      outs(%{{.*}} : tensor<256x512xf32>)
 
 // -----
 
-func.func @multi_batch_dim_generic(%arg0 : tensor<4x8x256x128xf32>, %arg1 : tensor<4x8x128x512xf32>,
+util.func public @multi_batch_dim_generic(%arg0 : tensor<4x8x256x128xf32>, %arg1 : tensor<4x8x128x512xf32>,
     %arg2 : tensor<4x8x256x512xf32>) -> tensor<4x8x256x512xf32> {
     %4 = linalg.generic {
         indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>,
@@ -1358,10 +1358,10 @@
       %6 = arith.addf %5, %out : f32
       linalg.yield %6 : f32
     } -> tensor<4x8x256x512xf32>
-  return %4 : tensor<4x8x256x512xf32>
+  util.return %4 : tensor<4x8x256x512xf32>
 }
 
-//      CHECK: func @multi_batch_dim_generic(
+//      CHECK: util.func public @multi_batch_dim_generic(
 //      CHECK:   linalg.generic
 // CHECK-SAME:      ins(%{{.*}}, %{{.*}} : tensor<4x8x256x128xf32>, tensor<4x8x128x512xf32>)
-// CHECK-SAME:      outs(%{{.*}} : tensor<4x8x256x512xf32>)
\ No newline at end of file
+// CHECK-SAME:      outs(%{{.*}} : tensor<4x8x256x512xf32>)

diff --git a/compiler/src/iree/compiler/GlobalOptimization/test/transformation_pipeline.mlir b/compiler/src/iree/compiler/GlobalOptimization/test/transformation_pipeline.mlir
index 0482ddf..3581392 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/test/transformation_pipeline.mlir
+++ b/compiler/src/iree/compiler/GlobalOptimization/test/transformation_pipeline.mlir

@@ -1,21 +1,21 @@
 // RUN: iree-opt --split-input-file --iree-global-optimization-transformation-pipeline %s | FileCheck %s
 
 // CHECK-LABEL: @empty
-func.func @empty() {
-  // CHECK-NEXT: return
-  return
+util.func public @empty() {
+  // CHECK-NEXT: util.return
+  util.return
 }
 
 // -----
 
-func.func @elementwiseOps(%arg0 : tensor<4xf32>) -> tensor<4xf32> {
+util.func public @elementwiseOps(%arg0 : tensor<4xf32>) -> tensor<4xf32> {
   %0 = arith.addf %arg0, %arg0 : tensor<4xf32>
   %1 = arith.subf %0, %arg0 : tensor<4xf32>
   %2 = arith.mulf %1, %arg0 : tensor<4xf32>
-  return %2 : tensor<4xf32>
+  util.return %2 : tensor<4xf32>
 }
 
-// CHECK-LABEL: func.func @elementwiseOps(%arg0: tensor<4xf32>) -> tensor<4xf32> {
+// CHECK-LABEL: util.func public @elementwiseOps(%arg0: tensor<4xf32>) -> tensor<4xf32> {
 //       CHECK:   %{{.+}} = linalg.generic
 //       CHECK:     %{{.+}} = arith.addf %{{.+}}, %{{.+}} : f32
 //       CHECK:   %{{.+}} = linalg.generic

diff --git a/compiler/src/iree/compiler/GlobalOptimization/test/transpose_and_decompose_concat.mlir b/compiler/src/iree/compiler/GlobalOptimization/test/transpose_and_decompose_concat.mlir
index 1841853..3a74b9f 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/test/transpose_and_decompose_concat.mlir
+++ b/compiler/src/iree/compiler/GlobalOptimization/test/transpose_and_decompose_concat.mlir

@@ -1,10 +1,10 @@
-// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-global-opt-decompose-concat{enable-concat-transposition=true}, cse))" %s | FileCheck %s
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(util.func(iree-global-opt-decompose-concat{enable-concat-transposition=true}, cse))" %s | FileCheck %s
 
-func.func @test_inner_dim_concat(%arg0: tensor<32x?x64xf16>, %arg1: tensor<32x?x64xf16>) -> tensor<32x?x128xf16> {
+util.func public @test_inner_dim_concat(%arg0: tensor<32x?x64xf16>, %arg1: tensor<32x?x64xf16>) -> tensor<32x?x128xf16> {
   %concat = tensor.concat dim(2) %arg0, %arg1 : (tensor<32x?x64xf16>, tensor<32x?x64xf16>) -> tensor<32x?x128xf16>
-  return %concat : tensor<32x?x128xf16>
+  util.return %concat : tensor<32x?x128xf16>
 }
-// CHECK-LABEL: func.func @test_inner_dim_concat
+// CHECK-LABEL: util.func public @test_inner_dim_concat
 //  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9]+]]: tensor<32x?x64xf16>
 //  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9]+]]: tensor<32x?x64xf16>
 //       CHECK:   %[[T0:.+]] = linalg.transpose ins(%[[ARG0]] : tensor<32x?x64xf16>) {{.*}} permutation = [2, 0, 1]
@@ -12,17 +12,17 @@
 //       CHECK:   %[[SLICE0:.+]] = tensor.insert_slice %[[T0]] {{.*}}[0, 0, 0] [64, 32, %{{.*}}] [1, 1, 1]
 //       CHECK:   %[[SLICE1:.+]] = tensor.insert_slice %[[T1]] into %[[SLICE0]][64, 0, 0] [64, 32, %{{.*}}] [1, 1, 1]
 //       CHECK:   %[[T2:.+]] = linalg.transpose ins(%[[SLICE1]] : tensor<128x32x?xf16>) {{.*}} permutation = [1, 2, 0]
-//       CHECK:   return %[[T2]] : tensor<32x?x128xf16>
+//       CHECK:   util.return %[[T2]] : tensor<32x?x128xf16>
 
 // -----
 
-func.func @test_outer_dim_concat(%arg0: tensor<32x?x64xf16>, %arg1: tensor<32x?x64xf16>) -> tensor<64x?x64xf16> {
+util.func public @test_outer_dim_concat(%arg0: tensor<32x?x64xf16>, %arg1: tensor<32x?x64xf16>) -> tensor<64x?x64xf16> {
   %concat = tensor.concat dim(0) %arg0, %arg1 : (tensor<32x?x64xf16>, tensor<32x?x64xf16>) -> tensor<64x?x64xf16>
-  return %concat : tensor<64x?x64xf16>
+  util.return %concat : tensor<64x?x64xf16>
 }
-// CHECK-LABEL: func.func @test_outer_dim_concat
+// CHECK-LABEL: util.func public @test_outer_dim_concat
 //  CHECK-SAME:   %[[ARG0:[a-zA-Z0-9]+]]: tensor<32x?x64xf16>
 //  CHECK-SAME:   %[[ARG1:[a-zA-Z0-9]+]]: tensor<32x?x64xf16>
 //       CHECK:   %[[SLICE0:.+]] = tensor.insert_slice %[[ARG0]] {{.*}}[0, 0, 0] [32, %{{.*}}, 64] [1, 1, 1]
 //       CHECK:   %[[SLICE1:.+]] = tensor.insert_slice %[[ARG1]] into %[[SLICE0]][32, 0, 0] [32, %{{.*}}, 64] [1, 1, 1]
-//       CHECK:   return %[[SLICE1]] : tensor<64x?x64xf16>
+//       CHECK:   util.return %[[SLICE1]] : tensor<64x?x64xf16>

diff --git a/compiler/src/iree/compiler/InputConversion/Common/IREEImportPublic.cpp b/compiler/src/iree/compiler/InputConversion/Common/IREEImportPublic.cpp
index e288e3b..15c0765 100644
--- a/compiler/src/iree/compiler/InputConversion/Common/IREEImportPublic.cpp
+++ b/compiler/src/iree/compiler/InputConversion/Common/IREEImportPublic.cpp

@@ -27,14 +27,6 @@
 
 namespace {
 
-// Allowlist of function attributes to retain when importing funcs.
-constexpr const char *kRetainedAttributes[] = {
-    "iree.abi",
-    "iree.reflection",
-    "sym_visibility",
-    "noinline",
-};
-
 struct IREEImportPublicPass
     : public IREEImportPublicBase<IREEImportPublicPass> {
   void getDependentDialects(DialectRegistry &registry) const override {
@@ -295,8 +287,10 @@
 };
 
 //===----------------------------------------------------------------------===//
+// Func dialect -> Util patterns
+//===----------------------------------------------------------------------===//
 
-class BuiltinFuncOpPattern : public OpConversionPattern<func::FuncOp> {
+class FuncFuncOpPattern : public OpConversionPattern<func::FuncOp> {
   using OpConversionPattern<func::FuncOp>::OpConversionPattern;
   LogicalResult
   matchAndRewrite(func::FuncOp srcOp, OpAdaptor adaptor,
@@ -320,26 +314,62 @@
       return rewriter.notifyMatchFailure(srcOp, "results failed to convert");
     }
 
+    // Build tied operands index mapping results back to operands.
+    SmallVector<int64_t> tiedOperands;
+    bool anyTiedOperands = false;
+    for (unsigned i = 0; i < srcFuncType.getNumResults(); ++i) {
+      auto tiedAttr =
+          srcOp.getResultAttrOfType<IntegerAttr>(i, "iree.abi.tied");
+      if (tiedAttr) {
+        tiedOperands.push_back(tiedAttr.getInt());
+      } else {
+        tiedOperands.push_back(-1);
+      }
+    }
+    auto tiedOperandsAttr = anyTiedOperands
+                                ? rewriter.getIndexArrayAttr(tiedOperands)
+                                : ArrayAttr{};
+
     // Create new function with converted argument and result types.
     // Note that attributes are dropped. Consider preserving some if needed.
     auto newFuncType = mlir::FunctionType::get(
         srcOp.getContext(), signatureConversion.getConvertedTypes(),
         convertedResultTypes);
-    auto newFuncOp = rewriter.create<func::FuncOp>(
-        srcOp.getLoc(), srcOp.getName(), newFuncType);
+    auto newFuncOp = rewriter.create<IREE::Util::FuncOp>(
+        srcOp.getLoc(), srcOp.getName(), newFuncType, tiedOperandsAttr);
+    newFuncOp.setSymVisibilityAttr(srcOp.getSymVisibilityAttr());
     rewriter.inlineRegionBefore(srcOp.getBody(), newFuncOp.getFunctionBody(),
                                 newFuncOp.end());
 
-    // Retain function attributes in the allowlist.
+    // Handle defacto attrs to specialized ones.
+    if (srcOp->hasAttr("noinline")) {
+      newFuncOp.setInliningPolicyAttr(
+          rewriter.getAttr<IREE::Util::InlineNeverAttr>());
+    }
+
+    // Allowlist of function attributes to retain when importing funcs.
+    constexpr const char *kRetainedAttributes[] = {
+        "iree.reflection",
+        "vm.fallback",
+        "vm.signature",
+        "vm.version",
+    };
     auto retainedAttributes = ArrayRef<const char *>(
         kRetainedAttributes,
         sizeof(kRetainedAttributes) / sizeof(kRetainedAttributes[0]));
     for (auto retainAttrName : retainedAttributes) {
       StringRef attrName(retainAttrName);
       Attribute attr = srcOp->getAttr(attrName);
-      if (attr) {
+      if (attr)
         newFuncOp->setAttr(attrName, attr);
-      }
+    }
+
+    // Copy all arg/result attrs. We could filter these.
+    if (auto argAttrs = srcOp.getAllArgAttrs()) {
+      newFuncOp.setAllArgAttrs(argAttrs);
+    }
+    if (auto resultAttrs = srcOp.getAllResultAttrs()) {
+      newFuncOp.setAllResultAttrs(resultAttrs);
     }
 
     // Tell the rewriter to convert the region signature.
@@ -355,6 +385,40 @@
   }
 };
 
+class FuncCallOpPattern : public OpConversionPattern<func::CallOp> {
+  using OpConversionPattern<func::CallOp>::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(func::CallOp srcOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    SmallVector<Type, 1> resultTypes;
+    if (failed(getTypeConverter()->convertTypes(srcOp.getResultTypes(),
+                                                resultTypes))) {
+      return rewriter.notifyMatchFailure(srcOp, "results failed to convert");
+    }
+    auto tiedOperandsAttr =
+        srcOp->getAttrOfType<ArrayAttr>("iree.abi.tied_operands");
+    rewriter.replaceOpWithNewOp<IREE::Util::CallOp>(
+        srcOp, resultTypes, srcOp.getCallee(), adaptor.getOperands(),
+        tiedOperandsAttr);
+    return success();
+  }
+};
+
+class FuncReturnOpPattern : public OpConversionPattern<func::ReturnOp> {
+  using OpConversionPattern<func::ReturnOp>::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(func::ReturnOp srcOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    rewriter.replaceOpWithNewOp<IREE::Util::ReturnOp>(srcOp,
+                                                      adaptor.getOperands());
+    return success();
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Generic conversion
+//===----------------------------------------------------------------------===//
+
 class GlobalOpPattern : public OpConversionPattern<IREE::Input::GlobalOp> {
   using OpConversionPattern::OpConversionPattern;
   LogicalResult
@@ -466,37 +530,30 @@
     }
     return true;
   };
-
-  target.addDynamicallyLegalOp<func::FuncOp>([&](func::FuncOp funcOp) {
-    for (Type type : funcOp.getFunctionType().getInputs()) {
-      if (isIllegalType(type))
-        return false;
-    }
-    for (Type type : funcOp.getFunctionType().getResults()) {
-      if (isIllegalType(type))
-        return false;
-    }
-    for (Block &block : funcOp.getFunctionBody()) {
-      for (Type type : block.getArgumentTypes()) {
-        if (isIllegalType(type))
-          return false;
-      }
-    }
-    return true;
-  });
   target.markUnknownOpDynamicallyLegal(isLegallyTypedOp);
 
   IREETypeConverter typeConverter;
   PatternBenefit specific_benefit = 100;
   patterns.insert<GenericTypeConvert>(typeConverter, &getContext(), 0);
-  patterns.insert<BuiltinFuncOpPattern>(typeConverter, &getContext(),
-                                        specific_benefit);
   patterns.insert<GlobalOpPattern>(typeConverter, &getContext(), 0);
   patterns.insert<TensorExportPattern, TensorImportPattern>(
       typeConverter, &getContext(), specific_benefit);
   patterns.insert<ExecutableSourcePattern, ExecutableExportPattern>(
       typeConverter, &getContext(), specific_benefit);
 
+  target.addDynamicallyLegalDialect<func::FuncDialect>(
+      [&](Operation *op) -> std::optional<bool> {
+        // Allow the func dialect within nested modules but not in the top-level
+        // one that represents the host program.
+        return op->getParentOfType<mlir::ModuleOp>() != getOperation();
+      });
+  patterns.insert<FuncFuncOpPattern>(typeConverter, &getContext(),
+                                     specific_benefit);
+  patterns.insert<FuncCallOpPattern>(typeConverter, &getContext(),
+                                     specific_benefit);
+  patterns.insert<FuncReturnOpPattern>(typeConverter, &getContext(),
+                                       specific_benefit);
+
 #define ONE_TO_ONE(SrcOpTy, TargetOpTy)                                        \
   patterns.insert<OneToOneConverionPattern>(                                   \
       typeConverter, SrcOpTy::getOperationName(),                              \

diff --git a/compiler/src/iree/compiler/InputConversion/Common/ImportMLProgram.cpp b/compiler/src/iree/compiler/InputConversion/Common/ImportMLProgram.cpp
index 8f680ad..d112f5e 100644
--- a/compiler/src/iree/compiler/InputConversion/Common/ImportMLProgram.cpp
+++ b/compiler/src/iree/compiler/InputConversion/Common/ImportMLProgram.cpp

@@ -12,7 +12,6 @@
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/MLProgram/IR/MLProgram.h"
 #include "mlir/IR/BuiltinAttributeInterfaces.h"
 #include "mlir/IR/BuiltinAttributes.h"
@@ -30,7 +29,7 @@
 
 struct ImportMLProgramPass : public ImportMLProgramBase<ImportMLProgramPass> {
   void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<IREE::Util::UtilDialect, func::FuncDialect>();
+    registry.insert<arith::ArithDialect, IREE::Util::UtilDialect>();
   }
   void runOnOperation() override;
 };
@@ -171,11 +170,11 @@
       FunctionType funcType =
           rewriter.getFunctionType(/*input=*/TypeRange{}, /*outputs=*/newType);
       ImplicitLocOpBuilder b(globalOp.getLoc(), rewriter);
-      auto funcOp = b.create<func::FuncOp>(getterName, funcType);
+      auto funcOp = b.create<IREE::Util::FuncOp>(getterName, funcType);
       funcOp.setPublic();
       b.setInsertionPointToStart(funcOp.addEntryBlock());
       auto val = globalOp.createLoadOp(globalOp.getLoc(), b);
-      b.create<func::ReturnOp>(val.getLoadedGlobalValue());
+      b.create<IREE::Util::ReturnOp>(val.getLoadedGlobalValue());
     }
 
     if (!setterName.empty() && isMutable) {
@@ -183,11 +182,11 @@
       FunctionType funcType =
           rewriter.getFunctionType(/*input=*/newType, /*outputs=*/TypeRange{});
       ImplicitLocOpBuilder b(globalOp.getLoc(), rewriter);
-      auto funcOp = b.create<func::FuncOp>(setterName, funcType);
+      auto funcOp = b.create<IREE::Util::FuncOp>(setterName, funcType);
       funcOp.setPublic();
       b.setInsertionPointToStart(funcOp.addEntryBlock());
       globalOp.createStoreOp(globalOp.getLoc(), funcOp.getArgument(0), b);
-      b.create<func::ReturnOp>();
+      b.create<IREE::Util::ReturnOp>();
     }
 
     return success();
@@ -210,7 +209,8 @@
       /*input=*/TypeRange{IREE::Util::ListType::get(
           IREE::Util::VariantType::get(context))},
       /*outputs=*/{});
-  auto funcOp = b.create<func::FuncOp>("ireeMlProgramGlobalsInit", funcType);
+  auto funcOp =
+      b.create<IREE::Util::FuncOp>("ireeMlProgramGlobalsInit", funcType);
   funcOp.setPublic();
   b.setInsertionPointToStart(funcOp.addEntryBlock());
 
@@ -221,7 +221,7 @@
     b.create<IREE::Util::GlobalStoreOp>(val, it.value().name);
   }
 
-  b.create<func::ReturnOp>();
+  b.create<IREE::Util::ReturnOp>();
 
   return success();
 }

diff --git a/compiler/src/iree/compiler/InputConversion/Common/test/import_ml_program.mlir b/compiler/src/iree/compiler/InputConversion/Common/test/import_ml_program.mlir
index 3084675..214bbf5 100644
--- a/compiler/src/iree/compiler/InputConversion/Common/test/import_ml_program.mlir
+++ b/compiler/src/iree/compiler/InputConversion/Common/test/import_ml_program.mlir

@@ -5,12 +5,12 @@
     ml_program.public_global_accessors = {
       get = "global${0}$get", set = "global${0}$set"}} {
   // CHECK: util.global private mutable @global_pubmut = 51 : i32
-  // CHECK: func @global$global_pubmut$get() -> i32
-  // CHECK: func @global$global_pubmut$set(%{{.*}}: i32)
+  // CHECK: util.func public @global$global_pubmut$get() -> i32
+  // CHECK: util.func public @global$global_pubmut$set(%{{.*}}: i32)
   // CHECK-NOT: func
   ml_program.global public mutable @global_pubmut(51 : i32) : i32
   // CHECK: util.global private @global_pub = 52 : i32
-  // CHECK: func @global$global_pub$get() -> i32
+  // CHECK: util.func public @global$global_pub$get() -> i32
   // CHECK-NOT: func
   ml_program.global public @global_pub(52 : i32) : i32
   // CHECK: util.global private mutable @global_privmut = 53 : i32
@@ -25,8 +25,8 @@
 builtin.module @globals attributes {
     ml_program.public_global_accessors = {get = "global__{0}__get"}} {
   // CHECK: util.global private mutable @global_pubmut = 51 : i32
-  // CHECK: func @global__global_pubmut__get() -> i32
-  // CHECK: func @global$global_pubmut$set
+  // CHECK: util.func public @global__global_pubmut__get() -> i32
+  // CHECK: util.func public @global$global_pubmut$set
   ml_program.global public mutable @global_pubmut(51 : i32) : i32
 }
 
@@ -34,8 +34,8 @@
 // CHECK-LABEL: module @no_accessors_globals
 builtin.module @no_accessors_globals {
   // CHECK: util.global private mutable @global_pubmut = 51 : i32
-  // CHECK: func @global$global_pubmut$get() -> i32
-  // CHECK: func @global$global_pubmut$set(%{{.*}}: i32)
+  // CHECK: util.func public @global$global_pubmut$get() -> i32
+  // CHECK: util.func public @global$global_pubmut$set(%{{.*}}: i32)
   ml_program.global public mutable @global_pubmut(51 : i32) : i32
 }
 
@@ -43,10 +43,10 @@
 // CHECK-LABEL: module @global_load
 builtin.module @global_load {
   ml_program.global private @v_loaded(dense<0> : tensor<4xi32>)  : tensor<4xi32>
-  func.func @loaded() {
+  util.func @loaded() {
     // CHECK: util.global.load @v_loaded : tensor<4xi32>
     %0 = ml_program.global_load @v_loaded : tensor<4xi32>
-    return
+    util.return
   }
 }
 
@@ -54,10 +54,10 @@
 // CHECK-LABEL: module @global_load_const
 builtin.module @global_load_const {
   ml_program.global private @v_loaded(dense<0> : tensor<4xi32>)  : tensor<4xi32>
-  func.func @loaded() {
+  util.func @loaded() {
     // CHECK: util.global.load @v_loaded : tensor<4xi32>
     %0 = ml_program.global_load_const @v_loaded : tensor<4xi32>
-    return
+    util.return
   }
 }
 
@@ -65,12 +65,12 @@
 // CHECK-LABEL: module @global_store
 builtin.module @global_store {
   ml_program.global private mutable @v_stored : tensor<4xi32>
-  func.func @stored() {
+  util.func @stored() {
     // CHECK: %[[CST:.*]] = arith.constant
     %cst = arith.constant dense<5> : tensor<4xi32>
     // CHECK: util.global.store %[[CST]], @v_stored : tensor<4xi32>
     ml_program.global_store @v_stored = %cst : tensor<4xi32>
-    return
+    util.return
   }
 }
 
@@ -88,7 +88,7 @@
 // CHECK-DAG: util.global private mutable @global_privmut : i32
 // CHECK-DAG: util.global private mutable @global_priv : i32
 
-// CHECK-LABEL: func.func @ireeMlProgramGlobalsInit(
+// CHECK-LABEL: util.func public @ireeMlProgramGlobalsInit(
 // CHECK-SAME:    %[[VAL_0:.*]]: !util.list<?>
 // CHECK:  %[[VAL_1:.*]] = arith.constant 0 : index
 // CHECK:  %[[VAL_2:.*]] = util.list.get %[[VAL_0]]{{\[}}%[[VAL_1]]] : !util.list<?> -> i32

diff --git a/compiler/src/iree/compiler/InputConversion/Common/test/iree_import_public.mlir b/compiler/src/iree/compiler/InputConversion/Common/test/iree_import_public.mlir
index 31ad781..b7d746d 100644
--- a/compiler/src/iree/compiler/InputConversion/Common/test/iree_import_public.mlir
+++ b/compiler/src/iree/compiler/InputConversion/Common/test/iree_import_public.mlir

@@ -1,53 +1,67 @@
 // RUN: iree-opt --split-input-file --iree-import-public %s | FileCheck %s
 
-// CHECK-LABEL: func.func @b_func
+// CHECK-LABEL: util.func private @private_func
+// CHECK: util.return
+func.func private @private_func() -> () {
+  return
+}
+
+// -----
+// CHECK-LABEL: util.func public @noinline_func
+// CHECK: inlining_policy = #util.inline.never
+func.func @noinline_func() -> () attributes {noinline} {
+  return
+}
+
+// -----
+// CHECK-LABEL: util.func public @b_func
 // CHECK-SAME: (%arg0: !hal.buffer, %arg1: !hal.buffer) -> (!hal.buffer, !hal.buffer)
-// CHECK: return %arg0, %arg1 : !hal.buffer, !hal.buffer
+// CHECK: util.return %arg0, %arg1 : !hal.buffer, !hal.buffer
 func.func @b_func(%arg0 : !iree_input.buffer, %arg1 : !iree_input.buffer) -> (!iree_input.buffer, !iree_input.buffer) {
   return %arg0, %arg1 : !iree_input.buffer, !iree_input.buffer
 }
 
 // -----
-// CHECK-LABEL: func.func @bv_func
+// CHECK-LABEL: util.func public @bv_func
 // CHECK-SAME: (%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> (!hal.buffer_view, !hal.buffer_view)
-// CHECK: return %arg0, %arg1 : !hal.buffer_view, !hal.buffer_view
+// CHECK: util.return %arg0, %arg1 : !hal.buffer_view, !hal.buffer_view
 func.func @bv_func(%arg0 : !iree_input.buffer_view, %arg1 : !iree_input.buffer_view) -> (!iree_input.buffer_view, !iree_input.buffer_view) {
   return %arg0, %arg1 : !iree_input.buffer_view, !iree_input.buffer_view
 }
 
 // -----
-// CHECK-LABEL: func.func @list_func
+// CHECK-LABEL: util.func public @list_func
 // CHECK-SAME: (%arg0: !util.list<?>) -> !util.list<?>
 func.func @list_func(%arg0 : !iree_input.list<!iree_input.variant>) -> !iree_input.list<!iree_input.variant> {
   return %arg0 : !iree_input.list<!iree_input.variant>
 }
 
 // -----
-// CHECK-LABEL: func.func @list_func_retains_iree_abi
+// CHECK-LABEL: util.func public @list_func_retains_iree_attrs
 // CHECK-SAME: (%arg0: !util.list<?>) -> !util.list<?>
-// CHECK-SAME: iree.abi = "FOOBAR"
-func.func @list_func_retains_iree_abi(%arg0 : !iree_input.list<!iree_input.variant>) -> !iree_input.list<!iree_input.variant>
-    attributes {iree.abi = "FOOBAR"} {
+// CHECK-SAME: iree.reflection = {some.attr}
+func.func @list_func_retains_iree_attrs(%arg0 : !iree_input.list<!iree_input.variant>) -> !iree_input.list<!iree_input.variant>
+    attributes {iree.reflection = {some.attr}} {
   return %arg0 : !iree_input.list<!iree_input.variant>
 }
 
 // -----
-// CHECK-LABEL: func.func @list_func_call
-// CHECK: call @list_func_call(%arg0) : (!util.list<?>) -> !util.list<?>
+// CHECK-LABEL: util.func public @list_func_call
+// CHECK: util.call @list_func_call(%arg0) : (!util.list<?>) -> !util.list<?>
 func.func @list_func_call(%arg0 : !iree_input.list<!iree_input.variant>) -> !iree_input.list<!iree_input.variant> {
   call @list_func_call(%arg0) : (!iree_input.list<!iree_input.variant>) -> !iree_input.list<!iree_input.variant>
   return %arg0 : !iree_input.list<!iree_input.variant>
 }
 
 // -----
-// CHECK-LABEL: func.func @ptr_func
+// CHECK-LABEL: util.func public @ptr_func
 // CHECK-SAME: (%arg0: !util.ptr<!hal.buffer_view>) -> !util.ptr<!hal.buffer_view>
 func.func @ptr_func(%arg0 : !iree_input.ptr<!iree_input.buffer_view>) -> !iree_input.ptr<!iree_input.buffer_view> {
   return %arg0 : !iree_input.ptr<!iree_input.buffer_view>
 }
 
 // -----
-// CHECK-LABEL: func.func @null_op
+// CHECK-LABEL: util.func public @null_op
 // CHECK: util.null : !util.variant
 func.func @null_op() -> !iree_input.variant {
   %0 = iree_input.null : !iree_input.variant
@@ -55,7 +69,7 @@
 }
 
 //----
-// CHECK-LABEL: func.func @buffer_subspan
+// CHECK-LABEL: util.func public @buffer_subspan
 // CHECK-SAME: (%arg0: !hal.buffer) -> !hal.buffer
 // CHECK: %[[OFFSET:.+]] = arith.constant 100
 // CHECK: %[[LENGTH:.+]] = arith.constant 200
@@ -70,7 +84,7 @@
 }
 
 //----
-// CHECK-LABEL: func.func @buffer_view_create
+// CHECK-LABEL: util.func public @buffer_view_create
 // CHECK-SAME: (%arg0: !hal.buffer) -> !hal.buffer_view
 // CHECK: %[[C0:.*]] = arith.constant 0 : index
 // CHECK: %[[C2:.*]] = arith.constant 2 : index
@@ -99,7 +113,7 @@
 }
 
 // -----
-// CHECK-LABEL: func.func @tensor_export
+// CHECK-LABEL: util.func public @tensor_export
 // CHECK: hal.tensor.export %arg0 : tensor<?x?x3xf32>{%arg1, %arg2} -> !hal.buffer_view
 func.func @tensor_export(%arg0 : tensor<?x?x3xf32>, %arg1 : index, %arg2 : index) -> !iree_input.buffer_view {
   %0 = iree_input.tensor.export %arg0 : tensor<?x?x3xf32>{%arg1, %arg2} -> !iree_input.buffer_view
@@ -107,7 +121,7 @@
 }
 
 // -----
-// CHECK-LABEL: func.func @tensor_export_static
+// CHECK-LABEL: util.func public @tensor_export_static
 // CHECK: hal.tensor.export %arg0 : tensor<3xf32> -> !hal.buffer_view
 func.func @tensor_export_static(%arg0 : tensor<3xf32>) -> !iree_input.buffer_view {
   %0 = iree_input.tensor.export %arg0 : tensor<3xf32> -> !iree_input.buffer_view
@@ -115,7 +129,7 @@
 }
 
 // -----
-// CHECK-LABEL: func.func @tensor_export_implicit_dims
+// CHECK-LABEL: util.func public @tensor_export_implicit_dims
 // CHECK: %[[ZERO:.*]] = arith.constant 0
 // CHECK: %[[D0:.*]] = tensor.dim %arg0, %[[ZERO]]
 // CHECK: %[[ONE:.*]] = arith.constant 1
@@ -127,7 +141,7 @@
 }
 
 // -----
-// CHECK-LABEL: func.func @tensor_import
+// CHECK-LABEL: util.func public @tensor_import
 // CHECK: hal.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?x3xf32>{%arg1, %arg2}
 func.func @tensor_import(%arg0 : !iree_input.buffer_view, %arg1 : index, %arg2 : index) -> tensor<?x?x3xf32> {
   %0 = iree_input.tensor.import %arg0 : !iree_input.buffer_view -> tensor<?x?x3xf32>{%arg1, %arg2}
@@ -135,7 +149,7 @@
 }
 
 // -----
-// CHECK-LABEL: func.func @tensor_import_static
+// CHECK-LABEL: util.func public @tensor_import_static
 // CHECK: hal.tensor.import %arg0 : !hal.buffer_view -> tensor<3xf32>
 func.func @tensor_import_static(%arg0 : !iree_input.buffer_view) -> tensor<3xf32> {
   %0 = iree_input.tensor.import %arg0 : !iree_input.buffer_view -> tensor<3xf32>
@@ -143,7 +157,7 @@
 }
 
 // -----
-// CHECK-LABEL: func.func @tensor_import_implicit_dims
+// CHECK-LABEL: util.func public @tensor_import_implicit_dims
 // CHECK: %[[D0:.*]] = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
 // CHECK: %[[D1:.*]] = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
 // CHECK: hal.tensor.import %arg0 : !hal.buffer_view -> tensor<?x?x3xf32>{%[[D0]], %[[D1]]}
@@ -153,7 +167,7 @@
 }
 
 // -----
-// CHECK-LABEL: func.func @buffer_view_rank
+// CHECK-LABEL: util.func public @buffer_view_rank
 // CHECK: hal.buffer_view.rank<%arg0 : !hal.buffer_view> : index
 func.func @buffer_view_rank(%arg0 : !iree_input.buffer_view) -> index {
   %0 = iree_input.buffer_view.rank %arg0 : index
@@ -161,16 +175,16 @@
 }
 
 // -----
-// CHECK-LABEL: func.func @byte_buffer_constant
+// CHECK-LABEL: util.func public @byte_buffer_constant
 // CHECK: %[[B:.*]] = util.buffer.constant "name" {alignment = 64 : index, mime_type = "text/plain"} : !util.buffer = "foo"
-// CHECK: return %[[B]] : !util.buffer
+// CHECK: util.return %[[B]] : !util.buffer
 func.func @byte_buffer_constant() -> !iree_input.byte_buffer {
   %0 = iree_input.byte_buffer.constant "name" {alignment = 64 : index, mime_type = "text/plain"} : !iree_input.byte_buffer = "foo"
   return %0 : !iree_input.byte_buffer
 }
 
 // -----
-// CHECK-LABEL: func.func @buffer_view_dim
+// CHECK-LABEL: util.func public @buffer_view_dim
 // CHECK: hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
 func.func @buffer_view_dim(%arg0 : !iree_input.buffer_view) -> index {
   %0 = iree_input.buffer_view.dim %arg0, 0 : index
@@ -178,7 +192,7 @@
 }
 
 // -----
-// CHECK-LABEL: func.func @list_create
+// CHECK-LABEL: util.func public @list_create
 // CHECK: util.list.create %arg0 : !util.list<?>
 func.func @list_create(%arg0 : index) -> !iree_input.list<!iree_input.variant> {
   %0 = iree_input.list.create %arg0 : !iree_input.list<!iree_input.variant>
@@ -186,7 +200,7 @@
 }
 
 // -----
-// CHECK-LABEL: func.func @list_size
+// CHECK-LABEL: util.func public @list_size
 // CHECK: util.list.size %arg0 : !util.list<?>
 func.func @list_size(%arg0 : !iree_input.list<!iree_input.variant>) -> index {
   %0 = iree_input.list.size %arg0 : !iree_input.list<!iree_input.variant>
@@ -194,7 +208,7 @@
 }
 
 // -----
-// CHECK-LABEL: func.func @list_resize
+// CHECK-LABEL: util.func public @list_resize
 // CHECK: util.list.resize %arg0, %arg1 : !util.list<?>
 func.func @list_resize(%arg0 : !iree_input.list<!iree_input.variant>, %arg1 : index) {
   iree_input.list.resize %arg0, %arg1 : !iree_input.list<!iree_input.variant>
@@ -202,7 +216,7 @@
 }
 
 // -----
-// CHECK-LABEL: func.func @list_get
+// CHECK-LABEL: util.func public @list_get
 // CHECK: util.list.get %arg0[%arg1] : !util.list<?>
 func.func @list_get(%arg0 : !iree_input.list<!iree_input.variant>, %arg1 : index) -> !iree_input.variant {
   %0 = iree_input.list.get %arg0[%arg1] : !iree_input.list<!iree_input.variant> -> !iree_input.variant
@@ -210,7 +224,7 @@
 }
 
 // -----
-// CHECK-LABEL: func.func @list_set
+// CHECK-LABEL: util.func public @list_set
 // CHECK: util.list.set %arg0[%arg1], %arg2 : !util.list<?>
 func.func @list_set(%arg0 : !iree_input.list<!iree_input.variant>, %arg1 : index, %arg2 : !iree_input.variant) {
   iree_input.list.set %arg0[%arg1], %arg2 : !iree_input.list<!iree_input.variant>, !iree_input.variant
@@ -218,7 +232,7 @@
 }
 
 // -----
-// CHECK-LABEL: func.func @tensor_reshape
+// CHECK-LABEL: util.func public @tensor_reshape
 // CHECK: flow.tensor.reshape %arg0 : tensor<?x?xf32>{%arg1, %arg2} -> tensor<?x?xf32>{%arg2, %arg1}
 func.func @tensor_reshape(%arg0 : tensor<?x?xf32>, %arg1 : index, %arg2 : index) -> tensor<?x?xf32> {
   %0 = iree_input.tensor.reshape %arg0 : tensor<?x?xf32>{%arg1, %arg2} -> tensor<?x?xf32>{%arg2, %arg1}
@@ -226,7 +240,7 @@
 }
 
 // -----
-// CHECK-LABEL: func.func @tensor_load
+// CHECK-LABEL: util.func public @tensor_load
 // CHECK: flow.tensor.load %arg0[%arg2, %arg3] : tensor<?x3xf32>{%arg1}
 func.func @tensor_load(%arg0 : tensor<?x3xf32>, %arg1 : index, %arg2 : index, %arg3 : index) -> f32 {
   %0 = iree_input.tensor.load %arg0[%arg2, %arg3] : tensor<?x3xf32>{%arg1}
@@ -234,7 +248,7 @@
 }
 
 // -----
-// CHECK-LABEL: func.func @tensor_store
+// CHECK-LABEL: util.func public @tensor_store
 // CHECK: flow.tensor.store %arg4, %arg0[%arg2, %arg3] : tensor<?x3xf32>{%arg1}
 func.func @tensor_store(%arg0 : tensor<?x3xf32>, %arg1 : index, %arg2 : index, %arg3 : index, %arg4 : f32) {
   iree_input.tensor.store %arg4, %arg0[%arg2, %arg3] : tensor<?x3xf32>{%arg1}
@@ -242,7 +256,7 @@
 }
 
 // -----
-// CHECK-LABEL: func.func @tensor_splat
+// CHECK-LABEL: util.func public @tensor_splat
 // CHECK: flow.tensor.splat %arg0 : tensor<?x?xf32>{%arg1, %arg2}
 func.func @tensor_splat(%arg0 : f32, %arg1 : index, %arg2 : index) -> tensor<?x?xf32> {
   %0 = iree_input.tensor.splat %arg0 : tensor<?x?xf32>{%arg1, %arg2}
@@ -250,7 +264,7 @@
 }
 
 // -----
-// CHECK-LABEL: func.func @tensor_clone
+// CHECK-LABEL: util.func public @tensor_clone
 // CHECK: flow.tensor.clone %arg0 : tensor<?x?xf32>{%arg1, %arg2}
 func.func @tensor_clone(%arg0 : tensor<?x?xf32>, %arg1 : index, %arg2 : index) -> tensor<?x?xf32> {
   %0 = iree_input.tensor.clone %arg0 : tensor<?x?xf32>{%arg1, %arg2}
@@ -258,7 +272,7 @@
 }
 
 // -----
-// CHECK-LABEL: func.func @tensor_slice
+// CHECK-LABEL: util.func public @tensor_slice
 // CHECK: flow.tensor.slice %arg0[%arg1 for %arg2] : tensor<?xf32>{%arg3} -> tensor<?xf32>{%arg4}
 func.func @tensor_slice(%arg0 : tensor<?xf32>, %arg1 : index, %arg2 : index, %arg3 : index, %arg4 : index) -> tensor<?xf32> {
   %0 = iree_input.tensor.slice %arg0[%arg1 for %arg2] : tensor<?xf32>{%arg3} -> tensor<?xf32>{%arg4}
@@ -266,7 +280,7 @@
 }
 
 // -----
-// CHECK-LABEL: func.func @tensor_update
+// CHECK-LABEL: util.func public @tensor_update
 // CHECK: flow.tensor.update %arg3, %arg0[%arg1] : tensor<?xf32>{%arg2} -> %arg0 as tensor<?xf32>{%arg4}
 func.func @tensor_update(%arg0 : tensor<?xf32>, %arg1 : index, %arg2 : index, %arg3 : tensor<?xf32>, %arg4 : index) -> tensor<?xf32> {
   %0 = iree_input.tensor.update %arg3, %arg0[%arg1] : tensor<?xf32>{%arg2} -> %arg0 as tensor<?xf32>{%arg4}
@@ -274,7 +288,7 @@
 }
 
 // -----
-// CHECK-LABEL: func.func @tensor_trace
+// CHECK-LABEL: util.func public @tensor_trace
 //      CHECK: flow.tensor.trace "FOOBAR" = [
 // CHECK-SAME:   %arg0 : tensor<5xf32>,
 // CHECK-SAME:   %arg1 : tensor<?x3xf32>{%arg2}
@@ -302,11 +316,11 @@
   // CHECK: util.global public @global5 : tensor<4xi32>
   iree_input.global @global5 initializer(@initializer) : tensor<4xi32>
   // CHECK-NEXT: util.initializer {
-  // CHECK-NEXT:   %[[VALUE:.+]] = func.call @initializer() : () -> tensor<4xi32>
+  // CHECK-NEXT:   %[[VALUE:.+]] = util.call @initializer() : () -> tensor<4xi32>
   // CHECK-NEXT:   util.global.store %[[VALUE]], @global5 : tensor<4xi32>
   // CHECK-NEXT:   util.return
   // CHECK-NEXT: }
-  // CHECK: func.func private @initializer() -> tensor<4xi32>
+  // CHECK: util.func private @initializer() -> tensor<4xi32>
   func.func private @initializer() -> tensor<4xi32>
 }
 
@@ -361,7 +375,7 @@
 }
 
 // -----
-// CHECK-LABEL: func.func @optimization_barrier
+// CHECK-LABEL: util.func public @optimization_barrier
 // CHECK: util.optimization_barrier %arg0 : tensor<f32>
 func.func @optimization_barrier(%arg0 : tensor<f32>) -> tensor<f32> {
   %0 = iree_input.optimization_barrier %arg0 : tensor<f32>
@@ -391,9 +405,7 @@
 #sm75 = #iree_input.executable.target<"cuda", "cuda-nvptx-fb", {
   target_arch = "sm_75"
 }>
-
 builtin.module @executable_source {
-
   iree_input.executable.source private @executable attributes {
     objects = #iree_input.executable.objects<{
       #sm75 = [#iree_input.executable.object<{path = "executable.ptx"}>]
@@ -409,7 +421,6 @@
       workgroup_size = [64 : index, 1 : index, 1 : index]
     }
   }
-
   func.func @dispatch(%arg0: tensor<f32>, %arg1: tensor<f32>) -> tensor<f32> {
     %c0 = arith.constant 0 : index
     %0 = flow.dispatch @executable::@add[%c0](%arg0, %arg1) : (tensor<f32>, tensor<f32>) -> %arg1

diff --git a/compiler/src/iree/compiler/Modules/Check/test/canonicalize.mlir b/compiler/src/iree/compiler/Modules/Check/test/canonicalize.mlir
index e5e6a3a..1afdfaa 100644
--- a/compiler/src/iree/compiler/Modules/Check/test/canonicalize.mlir
+++ b/compiler/src/iree/compiler/Modules/Check/test/canonicalize.mlir

@@ -4,20 +4,20 @@
 
 // CHECK-LABEL: @expect_eq_const
 // CHECK-SAME: %[[LHS:[a-zA-Z0-9$._-]+]]
-func.func @expect_eq_const(%lhs : tensor<2x2xi32>) {
+util.func public @expect_eq_const(%lhs : tensor<2x2xi32>) {
   // CHECK: %[[C:.+]] = arith.constant dense<1> : tensor<2x2xi32>
   // CHECK: check.expect_eq(%[[LHS]], %[[C]]) : tensor<2x2xi32>
   check.expect_eq_const(%lhs, dense<1> : tensor<2x2xi32>) : tensor<2x2xi32>
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @expect_almost_eq_const
 // CHECK-SAME: %[[LHS:[a-zA-Z0-9$._-]+]]
-func.func @expect_almost_eq_const(%lhs : tensor<2x2xf32>) {
+util.func public @expect_almost_eq_const(%lhs : tensor<2x2xf32>) {
   // CHECK: %[[C:.+]] = arith.constant dense<1.000000e+00> : tensor<2x2xf32>
   // CHECK: check.expect_almost_eq(%[[LHS]], %[[C]]) : tensor<2x2xf32>
   check.expect_almost_eq_const(%lhs, dense<1.0> : tensor<2x2xf32>) : tensor<2x2xf32>
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Modules/Check/test/ops.mlir b/compiler/src/iree/compiler/Modules/Check/test/ops.mlir
index 454e62a..97dc675 100644
--- a/compiler/src/iree/compiler/Modules/Check/test/ops.mlir
+++ b/compiler/src/iree/compiler/Modules/Check/test/ops.mlir

@@ -4,40 +4,40 @@
 
 // CHECK-LABEL: @expect_true
 // CHECK-SAME: %[[ARG:[a-zA-Z0-9$._-]+]]
-func.func @expect_true(%arg : i32) {
+util.func public @expect_true(%arg : i32) {
   // CHECK: check.expect_true(%[[ARG]]) : i32
   check.expect_true(%arg) : i32
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @expect_false
 // CHECK-SAME: %[[ARG:[a-zA-Z0-9$._-]+]]
-func.func @expect_false(%arg : i32) {
+util.func public @expect_false(%arg : i32) {
   // CHECK: check.expect_false(%[[ARG]]) : i32
   check.expect_false(%arg) : i32
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @expect_all_true
 // CHECK-SAME: %[[ARG:[a-zA-Z0-9$._-]+]]
-func.func @expect_all_true(%arg : !hal.buffer_view) {
+util.func public @expect_all_true(%arg : !hal.buffer_view) {
   // CHECK: check.expect_all_true(%[[ARG]]) : !hal.buffer_view
   check.expect_all_true(%arg) : !hal.buffer_view
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @expect_all_true_tensor
 // CHECK-SAME: %[[ARG:[a-zA-Z0-9$._-]+]]
-func.func @expect_all_true_tensor(%arg : tensor<2x2xi32>) {
+util.func public @expect_all_true_tensor(%arg : tensor<2x2xi32>) {
   // CHECK: check.expect_all_true(%[[ARG]]) : tensor<2x2xi32>
   check.expect_all_true(%arg) : tensor<2x2xi32>
-  return
+  util.return
 }
 
 // -----
@@ -45,10 +45,10 @@
 // CHECK-LABEL: @expect_eq
 // CHECK-SAME: %[[LHS:[a-zA-Z0-9$._-]+]]
 // CHECK-SAME: %[[RHS:[a-zA-Z0-9$._-]+]]
-func.func @expect_eq(%lhs : !hal.buffer_view, %rhs : !hal.buffer_view) {
+util.func public @expect_eq(%lhs : !hal.buffer_view, %rhs : !hal.buffer_view) {
   // CHECK: check.expect_eq(%[[LHS]], %[[RHS]]) : !hal.buffer_view
   check.expect_eq(%lhs, %rhs) : !hal.buffer_view
-  return
+  util.return
 }
 
 // -----
@@ -56,20 +56,20 @@
 // CHECK-LABEL: @expect_eq_tensor
 // CHECK-SAME: %[[LHS:[a-zA-Z0-9$._-]+]]
 // CHECK-SAME: %[[RHS:[a-zA-Z0-9$._-]+]]
-func.func @expect_eq_tensor(%lhs : tensor<2x2xi32>, %rhs : tensor<2x2xi32>) {
+util.func public @expect_eq_tensor(%lhs : tensor<2x2xi32>, %rhs : tensor<2x2xi32>) {
   // CHECK: check.expect_eq(%[[LHS]], %[[RHS]]) : tensor<2x2xi32>
   check.expect_eq(%lhs, %rhs) : tensor<2x2xi32>
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @expect_eq_const
 // CHECK-SAME: %[[LHS:[a-zA-Z0-9$._-]+]]
-func.func @expect_eq_const(%lhs : tensor<2x2xi32>) {
+util.func public @expect_eq_const(%lhs : tensor<2x2xi32>) {
   // CHECK: check.expect_eq_const(%[[LHS]], dense<1> : tensor<2x2xi32>) : tensor<2x2xi32>
   check.expect_eq_const(%lhs, dense<1> : tensor<2x2xi32>) : tensor<2x2xi32>
-  return
+  util.return
 }
 
 // -----
@@ -77,10 +77,10 @@
 // CHECK-LABEL: @expect_almost_eq
 // CHECK-SAME: %[[LHS:[a-zA-Z0-9$._-]+]]
 // CHECK-SAME: %[[RHS:[a-zA-Z0-9$._-]+]]
-func.func @expect_almost_eq(%lhs : !hal.buffer_view, %rhs : !hal.buffer_view) {
+util.func public @expect_almost_eq(%lhs : !hal.buffer_view, %rhs : !hal.buffer_view) {
   // CHECK: check.expect_almost_eq(%[[LHS]], %[[RHS]]) : !hal.buffer_view
   check.expect_almost_eq(%lhs, %rhs) : !hal.buffer_view
-  return
+  util.return
 }
 
 // -----
@@ -88,18 +88,18 @@
 // CHECK-LABEL: @expect_almost_eq_tensor
 // CHECK-SAME: %[[LHS:[a-zA-Z0-9$._-]+]]
 // CHECK-SAME: %[[RHS:[a-zA-Z0-9$._-]+]]
-func.func @expect_almost_eq_tensor(%lhs : tensor<2x2xf32>, %rhs : tensor<2x2xf32>) {
+util.func public @expect_almost_eq_tensor(%lhs : tensor<2x2xf32>, %rhs : tensor<2x2xf32>) {
   // CHECK: check.expect_almost_eq(%[[LHS]], %[[RHS]]) : tensor<2x2xf32>
   check.expect_almost_eq(%lhs, %rhs) : tensor<2x2xf32>
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @expect_almost_eq_const
 // CHECK-SAME: %[[LHS:[a-zA-Z0-9$._-]+]]
-func.func @expect_almost_eq_const(%lhs : tensor<2x2xf32>) {
+util.func public @expect_almost_eq_const(%lhs : tensor<2x2xf32>) {
   // CHECK: check.expect_almost_eq_const(%[[LHS]], dense<1.000000e+00> : tensor<2x2xf32>) : tensor<2x2xf32>
   check.expect_almost_eq_const(%lhs, dense<1.0> : tensor<2x2xf32>) : tensor<2x2xf32>
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/HALToHALInline/test/buffer_ops.mlir b/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/HALToHALInline/test/buffer_ops.mlir
index 815bd44..a817404 100644
--- a/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/HALToHALInline/test/buffer_ops.mlir
+++ b/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/HALToHALInline/test/buffer_ops.mlir

@@ -2,7 +2,7 @@
 
 // CHECK-LABEL: @buffer_subspan
 // CHECK-SAME: (%[[BUFFER:.+]]: !hal.buffer)
-func.func @buffer_subspan(%buffer: !hal.buffer) -> !hal.buffer {
+util.func public @buffer_subspan(%buffer: !hal.buffer) -> !hal.buffer {
   // CHECK-DAG: %[[OFFSET:.+]] = arith.constant 100
   %offset = arith.constant 100 : index
   // CHECK-DAG: %[[LENGTH:.+]] = arith.constant 200
@@ -10,24 +10,24 @@
   // CHECK: %[[SUBSPAN:.+]] = hal_inline.buffer.subspan<%[[BUFFER]] : !hal.buffer>[%[[OFFSET]], %[[LENGTH]]] : !hal.buffer
   %subspan = hal.buffer.subspan<%buffer : !hal.buffer>[%offset, %length] : !hal.buffer
   // CHECK: return %[[SUBSPAN]]
-  return %subspan : !hal.buffer
+  util.return %subspan : !hal.buffer
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_length
 // CHECK-SAME: (%[[BUFFER:.+]]: !hal.buffer)
-func.func @buffer_length(%buffer: !hal.buffer) -> index {
+util.func public @buffer_length(%buffer: !hal.buffer) -> index {
   // CHECK: hal_inline.buffer.length<%[[BUFFER]] : !hal.buffer> : index
   %length = hal.buffer.length<%buffer : !hal.buffer> : index
-  return %length : index
+  util.return %length : index
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_load
 // CHECK-SAME: (%[[BUFFER:.+]]: !hal.buffer)
-func.func @buffer_load(%buffer: !hal.buffer) -> i32 {
+util.func public @buffer_load(%buffer: !hal.buffer) -> i32 {
   // CHECK-DAG: %[[REL_OFFSET:.+]] = arith.constant 100
   %rel_offset = arith.constant 100 : index
   // CHECK-DAG: %[[STORAGE:.+]] = hal_inline.buffer.storage<%[[BUFFER:.+]] : !hal.buffer> : !util.buffer
@@ -35,19 +35,19 @@
   // CHECK: %[[VALUE:.+]] = util.buffer.load %[[STORAGE]][%[[REL_OFFSET]] for {{.+}}] : !util.buffer{%[[LENGTH]]} -> i32
   %value = hal.buffer.load<%buffer : !hal.buffer>[%rel_offset] : i32
   // CHECK-NEXT: return %[[VALUE]]
-  return %value : i32
+  util.return %value : i32
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_store
 // CHECK-SAME: (%[[BUFFER:.+]]: !hal.buffer, %[[VALUE:.+]]: i32)
-func.func @buffer_store(%buffer: !hal.buffer, %value: i32) {
+util.func public @buffer_store(%buffer: !hal.buffer, %value: i32) {
   // CHECK-DAG: %[[REL_OFFSET:.+]] = arith.constant 100
   %rel_offset = arith.constant 100 : index
   // CHECK-DAG: %[[STORAGE:.+]] = hal_inline.buffer.storage<%[[BUFFER:.+]] : !hal.buffer> : !util.buffer
   // CHECK-DAG: %[[LENGTH:.+]] = hal_inline.buffer.length<%[[BUFFER]] : !hal.buffer> : index
   // CHECK: util.buffer.store %[[VALUE]], %[[STORAGE]][%[[REL_OFFSET]] for {{.+}}] : i32 -> !util.buffer{%[[LENGTH]]}
   hal.buffer.store<%buffer : !hal.buffer>[%rel_offset] value(%value : i32)
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/HALToHALInline/test/buffer_view_ops.mlir b/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/HALToHALInline/test/buffer_view_ops.mlir
index ca71d84..aaa8937 100644
--- a/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/HALToHALInline/test/buffer_view_ops.mlir
+++ b/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/HALToHALInline/test/buffer_view_ops.mlir

@@ -1,7 +1,7 @@
 // RUN: iree-opt --split-input-file --iree-hal-inline-conversion %s | FileCheck %s
 
 // CHECK-LABEL: @buffer_view_create
-func.func @buffer_view_create(%arg0: !hal.buffer, %arg1: index, %arg2: index) -> !hal.buffer_view {
+util.func public @buffer_view_create(%arg0: !hal.buffer, %arg1: index, %arg2: index) -> !hal.buffer_view {
   %c1 = arith.constant 1 : i32
   %c32 = arith.constant 32 : i32
   // CHECK: %view = hal_inline.buffer_view.create
@@ -13,25 +13,25 @@
                                  shape([%arg1, %arg2])
                                  type(%c32)
                                  encoding(%c1) : !hal.buffer_view
-  return %view : !hal.buffer_view
+  util.return %view : !hal.buffer_view
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_view_buffer
-func.func @buffer_view_buffer(%arg0: !hal.buffer_view) -> !hal.buffer {
+util.func public @buffer_view_buffer(%arg0: !hal.buffer_view) -> !hal.buffer {
   // CHECK: %buffer = hal_inline.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
   %buffer = hal.buffer_view.buffer<%arg0 : !hal.buffer_view> : !hal.buffer
-  return %buffer : !hal.buffer
+  util.return %buffer : !hal.buffer
 }
 
 // -----
 
 // CHECK-LABEL: @buffer_view_shape_queries
-func.func @buffer_view_shape_queries(%arg0: !hal.buffer_view) -> (index, index) {
+util.func public @buffer_view_shape_queries(%arg0: !hal.buffer_view) -> (index, index) {
   // CHECK: %{{.+}} = hal_inline.buffer_view.rank<%arg0 : !hal.buffer_view> : index
   %0 = hal.buffer_view.rank<%arg0 : !hal.buffer_view> : index
   // CHECK: %{{.+}} = hal_inline.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
   %1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
-  return %0, %1 : index, index
+  util.return %0, %1 : index, index
 }

diff --git a/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/Patterns.cpp b/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/Patterns.cpp
index 12c1ba0..d0e643d 100644
--- a/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/Patterns.cpp
+++ b/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/Patterns.cpp

@@ -15,7 +15,6 @@
 #include "iree/compiler/Modules/HAL/Inline/IR/HALInlineDialect.h"
 #include "iree/compiler/Modules/HAL/Inline/IR/HALInlineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Transforms/DialectConversion.h"
 
 namespace mlir::iree_compiler {
@@ -497,8 +496,9 @@
     llvm::append_range(callArgs, bindingBuffers);
     llvm::append_range(callArgs, bindingOffsets);
     llvm::append_range(callArgs, adaptor.getResourceLengths());
-    rewriter.replaceOpWithNewOp<func::CallOp>(dispatchOp, callee, TypeRange{},
-                                              callArgs);
+    rewriter.replaceOpWithNewOp<IREE::Util::CallOp>(
+        dispatchOp, TypeRange{}, callee.getLeafReference(), callArgs,
+        /*tied_operands=*/ArrayAttr{});
     return success();
   }
 };
@@ -516,11 +516,12 @@
                                                 newResultTypes))) {
       return rewriter.notifyMatchFailure(funcOp, "failed to convert types");
     }
-    auto newOp = rewriter.replaceOpWithNewOp<func::FuncOp>(
+    auto newOp = rewriter.replaceOpWithNewOp<IREE::Util::FuncOp>(
         funcOp, funcOp.getName(),
         rewriter.getFunctionType(newArgTypes, newResultTypes),
-        funcOp.getSymVisibilityAttr(), funcOp.getAllArgAttrs(),
-        funcOp.getAllResultAttrs());
+        /*tied_operands=*/ArrayAttr{}, funcOp.getSymVisibilityAttr(),
+        funcOp.getAllArgAttrs(), funcOp.getAllResultAttrs(),
+        IREE::Util::InliningPolicyAttrInterface{});
     newOp->setDialectAttrs(funcOp->getDialectAttrs());
     return success();
   }
@@ -561,8 +562,9 @@
       llvm::append_range(resultTypes, convertedTypes);
     }
 
-    rewriter.replaceOpWithNewOp<func::CallOp>(callOp, callOp.getCalleeAttr(),
-                                              resultTypes, operands);
+    rewriter.replaceOpWithNewOp<IREE::Util::CallOp>(
+        callOp, resultTypes, callOp.getCallee(), operands,
+        /*tied_operands=*/ArrayAttr{});
     return success();
   }
 };

diff --git a/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/cmd_ops.mlir b/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/cmd_ops.mlir
index cf6ff97..8c3d022 100644
--- a/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/cmd_ops.mlir
+++ b/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/cmd_ops.mlir

@@ -4,7 +4,7 @@
 // assume coherent memory.
 
 // CHECK-LABEL: @cmdMemoryControl
-func.func @cmdMemoryControl(%arg0: !stream.resource<transient>, %arg1: index) -> !stream.timepoint {
+util.func public @cmdMemoryControl(%arg0: !stream.resource<transient>, %arg1: index) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   %fence = stream.cmd.execute with(%arg0 as %arg2: !stream.resource<transient>{%arg1}) {
@@ -13,14 +13,14 @@
     stream.cmd.discard %arg2[%c0 for %c128] : !stream.resource<transient>{%arg1}
   } => !stream.timepoint
   // CHECK: return %c0
-  return %fence : !stream.timepoint
+  util.return %fence : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @cmdFill
 // CHECK-SAME: (%[[TARGET:.+]]: !util.buffer, %[[TARGET_SIZE:.+]]: index)
-func.func @cmdFill(%target: !stream.resource<transient>, %target_size: index) -> !stream.timepoint {
+util.func public @cmdFill(%target: !stream.resource<transient>, %target_size: index) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   // CHECK-DAG: %[[LENGTH:.+]] = arith.constant 128
   %length = arith.constant 128 : index
@@ -31,14 +31,14 @@
     stream.cmd.fill %value, %target_inner[%c0 for %length] : i32 -> !stream.resource<transient>{%target_size}
   } => !stream.timepoint
   // CHECK: return %c0
-  return %fence : !stream.timepoint
+  util.return %fence : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @cmdCopy
 // CHECK-SAME: (%[[SRC:.+]]: !util.buffer, %[[SRC_SIZE:.+]]: index, %[[DST:.+]]: !util.buffer, %[[DST_SIZE:.+]]: index)
-func.func @cmdCopy(%src: !stream.resource<transient>, %src_size: index,
+util.func public @cmdCopy(%src: !stream.resource<transient>, %src_size: index,
                    %dst: !stream.resource<staging>, %dst_size: index) -> !stream.timepoint {
   // CHECK-DAG: %[[SRC_OFFSET:.+]] = arith.constant 100
   %src_offset = arith.constant 100 : index
@@ -52,13 +52,13 @@
     stream.cmd.copy %src_inner[%src_offset], %dst_inner[%dst_offset], %length : !stream.resource<transient>{%src_size} -> !stream.resource<staging>{%dst_size}
   } => !stream.timepoint
   // CHECK: return %c0
-  return %fence : !stream.timepoint
+  util.return %fence : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @cmdExecute
-func.func @cmdExecute(%arg0: !stream.resource<transient>, %arg1: index, %arg2: !stream.resource<staging>, %arg3: index, %arg4: !stream.timepoint) -> !stream.timepoint {
+util.func public @cmdExecute(%arg0: !stream.resource<transient>, %arg1: index, %arg2: !stream.resource<staging>, %arg3: index, %arg4: !stream.timepoint) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   %fence = stream.cmd.execute await(%arg4) => with(%arg0 as %arg5: !stream.resource<transient>{%arg1}, %arg2 as %arg6: !stream.resource<staging>{%arg3}) {
@@ -78,13 +78,13 @@
     }
   } => !stream.timepoint
   // CHECK: return %c0
-  return %fence : !stream.timepoint
+  util.return %fence : !stream.timepoint
 }
 
 // -----
 
 // Provided by the iree-hal-inline-executables pass:
-func.func private @__dispatch_ex_dispatch(
+util.func private @__dispatch_ex_dispatch(
     index, index,                 // workload[2]
     i32, i32,                     // pushConstants[2]
     !util.buffer, !util.buffer,   // bindingBuffers[2]
@@ -97,7 +97,7 @@
 // CHECK-LABEL: @cmdDispatch
 // CHECK-SAME: (%[[BUFFER0:.+]]: !util.buffer, %[[BUFFER0_SIZE:.+]]: index,
 // CHECK-SAME:  %[[BUFFER1:.+]]: !hal.buffer, %[[BUFFER1_SIZE:.+]]: index)
-func.func @cmdDispatch(%buffer0: !stream.resource<transient>, %buffer0_size: index,
+util.func public @cmdDispatch(%buffer0: !stream.resource<transient>, %buffer0_size: index,
                        %buffer1: !stream.resource<external>, %buffer1_size: index) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
@@ -112,7 +112,7 @@
   %fence = stream.cmd.execute with(%buffer0 as %buffer0_inner: !stream.resource<transient>{%buffer0_size},
                                    %buffer1 as %buffer1_inner: !stream.resource<external>{%buffer1_size}) {
     // CHECK: %[[BUFFER1_STORAGE:.+]] = hal_inline.buffer.storage<%[[BUFFER1]]
-    // CHECK: call @__dispatch_ex_dispatch(
+    // CHECK: util.call @__dispatch_ex_dispatch(
     // CHECK-SAME: %c1, %c2,
     // CHECK-SAME: %c4_i32, %c5_i32,
     // CHECK-SAME: %[[BUFFER0]], %[[BUFFER1_STORAGE]],
@@ -127,7 +127,7 @@
     }
   } => !stream.timepoint
   // CHECK: return %c0
-  return %fence : !stream.timepoint
+  util.return %fence : !stream.timepoint
 }
 
 // -----
@@ -136,11 +136,11 @@
 // Note that we get a buffer + offset + length for each resource but unlike the
 // full HAL path there's no command buffer passed in.
 
-// CHECK: func.func private @cmdFunc(!util.buffer, index, index, i32, !util.buffer, index, index, !custom.type, !util.buffer, index, index)
+// CHECK: util.func private @cmdFunc(%arg0: !util.buffer, %arg1: index, %arg2: index, %arg3: i32, %arg4: !util.buffer, %arg5: index, %arg6: index, %arg7: !custom.type, %arg8: !util.buffer, %arg9: index, %arg10: index)
 stream.cmd.func private @cmdFunc(%arg0[%arg1 for %arg2]: !stream.resource<*>, %arg3: i32, %arg4[%arg5 for %arg6]: !stream.resource<*>, %arg7: !custom.type, %arg8[%arg9 for %arg10]: !stream.resource<*>)
 
 // CHECK-LABEL: @cmdCall
-func.func @cmdCall(%arg0: !stream.resource<external>, %arg1: i32, %arg2: !stream.resource<transient>, %arg3: !custom.type, %arg4: !stream.resource<transient>) -> !stream.timepoint {
+util.func public @cmdCall(%arg0: !stream.resource<external>, %arg1: i32, %arg2: !stream.resource<transient>, %arg3: !custom.type, %arg4: !stream.resource<transient>) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   // CHECK-DAG: %[[SIZE0:.+]] = arith.constant 100
   %size0 = arith.constant 100 : index
@@ -150,9 +150,9 @@
   %size2 = arith.constant 102 : index
   // CHECK-DAG: %[[ARG0_STORAGE:.+]] = hal_inline.buffer.storage<%arg0 : !hal.buffer> : !util.buffer
   %timepoint = stream.cmd.execute with(%arg0 as %stream0: !stream.resource<external>{%size0}, %arg2 as %stream1: !stream.resource<transient>{%size1}, %arg4 as %stream2: !stream.resource<transient>{%size2}) {
-    // CHECK: call @cmdFunc(%[[ARG0_STORAGE]], %c0, %[[SIZE0]], %arg1, %arg2, %c0, %[[SIZE1]], %arg3, %arg4, %c0, %[[SIZE2]]) :
+    // CHECK: util.call @cmdFunc(%[[ARG0_STORAGE]], %c0, %[[SIZE0]], %arg1, %arg2, %c0, %[[SIZE1]], %arg3, %arg4, %c0, %[[SIZE2]]) :
     // CHECK-SAME: (!util.buffer, index, index, i32, !util.buffer, index, index, !custom.type, !util.buffer, index, index) -> ()
     stream.cmd.call @cmdFunc(ro %stream0[%c0 for %size0], %arg1, rw %stream1[%c0 for %size1], %arg3, wo %stream2[%c0 for %size2]) : (!stream.resource<external>{%size0}, i32, !stream.resource<transient>{%size1}, !custom.type, !stream.resource<transient>{%size2}) -> ()
   } => !stream.timepoint
-  return %timepoint : !stream.timepoint
+  util.return %timepoint : !stream.timepoint
 }

diff --git a/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/debug_ops.mlir b/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/debug_ops.mlir
index 24d3565..4ede010 100644
--- a/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/debug_ops.mlir
+++ b/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/debug_ops.mlir

@@ -2,7 +2,7 @@
 
 // CHECK-LABEL: @tensorTrace
 // CHECK-SAME: (%[[TENSOR0_STORAGE:.+]]: !util.buffer, %[[TENSOR0_SIZE:.+]]: index, %[[TENSOR1_STORAGE:.+]]: !util.buffer, %[[TENSOR1_SIZE:.+]]: index, %[[TENSOR1_DIM0:.+]]: index)
-func.func @tensorTrace(%tensor0: !stream.resource<staging>, %tensor0_size: index, %tensor1: !stream.resource<staging>, %tensor1_size: index, %tensor1_dim0: index) {
+util.func public @tensorTrace(%tensor0: !stream.resource<staging>, %tensor0_size: index, %tensor1: !stream.resource<staging>, %tensor1_size: index, %tensor1_dim0: index) {
   // CHECK-DAG: %[[TENSOR0_BUFFER:.+]] = hal_inline.buffer.wrap source(%[[TENSOR0_STORAGE]] : !util.buffer)[%c0, %[[TENSOR0_SIZE]]] : !hal.buffer
   // CHECK-DAG: %[[TENSOR0:.+]] = hal_inline.buffer_view.create buffer(%[[TENSOR0_BUFFER]] : !hal.buffer)[%c0{{.*}}, %[[TENSOR0_SIZE]]] shape([%c5, %c3])
   // CHECK-DAG: %[[TENSOR1_BUFFER:.+]] = hal_inline.buffer.wrap source(%[[TENSOR1_STORAGE]] : !util.buffer)[%c0, %[[TENSOR1_SIZE]]] : !hal.buffer
@@ -12,5 +12,5 @@
     %tensor0 : tensor<5x3xf32> in !stream.resource<staging>{%tensor0_size},
     %tensor1 : tensor<?x5xf32>{%tensor1_dim0} in !stream.resource<staging>{%tensor1_size}
   ]
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/file_ops.mlir b/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/file_ops.mlir
index 69ce43f..58ab268 100644
--- a/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/file_ops.mlir
+++ b/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/file_ops.mlir

@@ -7,7 +7,7 @@
 
 // CHECK-LABEL: @file_constant
 //  CHECK-SAME: (%[[BUFFER:.+]]: !util.buffer) -> !util.buffer
-func.func @file_constant(%buffer: !util.buffer) -> !stream.file {
+util.func public @file_constant(%buffer: !util.buffer) -> !stream.file {
   %c0 = arith.constant 0 : index
   %c100 = arith.constant 100 : index
   %c200 = arith.constant 200 : index
@@ -15,28 +15,28 @@
   // CHECK: %[[SPAN:.+]] = util.buffer.subspan %[[BUFFER]][%c100] : !util.buffer{%c300} -> !util.buffer{%c200}
   %file = stream.file.constant %buffer[%c100 for %c200] : !util.buffer{%c300} -> !stream.file
   // CHECK: return %[[SPAN]]
-  return %file : !stream.file
+  util.return %file : !stream.file
 }
 
 // -----
 
 // CHECK-LABEL: @file_read
 //  CHECK-SAME: (%[[WAIT:.+]]: i64, %[[FILE:.+]]: !util.buffer, %[[RESOURCE:.+]]: !util.buffer)
-func.func @file_read(%wait: !stream.timepoint, %file: !stream.file, %resource: !stream.resource<variable>) -> !stream.timepoint {
+util.func public @file_read(%wait: !stream.timepoint, %file: !stream.file, %resource: !stream.resource<variable>) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %offset = arith.constant 100 : i64
   %c1088 = arith.constant 1088 : index
   // CHECK: %[[SIGNAL:.+]] = arith.constant 0 : i64
   %signal = stream.file.read await(%wait) => %file[%offset], %resource[%c0], %c1088 : !stream.file -> !stream.resource<variable>{%c1088} => !stream.timepoint
   // CHECK: return %[[SIGNAL]]
-  return %signal : !stream.timepoint
+  util.return %signal : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @file_write
 //  CHECK-SAME: (%[[WAIT:.+]]: i64, %[[FILE:.+]]: !util.buffer, %[[RESOURCE:.+]]: !util.buffer)
-func.func @file_write(%wait: !stream.timepoint, %file: !stream.file, %resource: !stream.resource<variable>) -> !stream.timepoint {
+util.func public @file_write(%wait: !stream.timepoint, %file: !stream.file, %resource: !stream.resource<variable>) -> !stream.timepoint {
   %c0 = arith.constant 0 : index
   %offset = arith.constant 100 : i64
   %c1088 = arith.constant 1088 : index
@@ -45,14 +45,14 @@
   // CHECK: %[[SIGNAL:.+]] = arith.constant 0 : i64
   %signal = stream.file.write await(%wait) => %resource[%c0], %file[%offset], %c1088 : !stream.resource<variable>{%c1088} -> !stream.file => !stream.timepoint
   // CHECK: return %[[SIGNAL]]
-  return %signal : !stream.timepoint
+  util.return %signal : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @variable_read
 //  CHECK-SAME: (%[[WAIT:.+]]: i64) -> (!util.buffer, i64)
-func.func @variable_read(%wait: !stream.timepoint) -> (!stream.resource<variable>, !stream.timepoint) {
+util.func public @variable_read(%wait: !stream.timepoint) -> (!stream.resource<variable>, !stream.timepoint) {
   %c0 = arith.constant 0 : index
   %c16 = arith.constant 16 : index
   %c32 = arith.constant 32 : index
@@ -69,5 +69,5 @@
   // CHECK: %[[SIGNAL:.+]] = arith.constant 0 : i64
   %signal = stream.file.read await(%wait) => %file[%c100], %resource[%c32], %c32 : !stream.file -> !stream.resource<variable>{%c64} => !stream.timepoint
   // CHECK: return %[[STORAGE]], %[[SIGNAL]]
-  return %resource, %signal : !stream.resource<variable>, !stream.timepoint
+  util.return %resource, %signal : !stream.resource<variable>, !stream.timepoint
 }

diff --git a/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/resource_ops.mlir b/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/resource_ops.mlir
index 9de6d8f..f6326b5 100644
--- a/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/resource_ops.mlir
+++ b/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/resource_ops.mlir

@@ -2,35 +2,35 @@
 
 // CHECK-LABEL: @resourceAlloc
 // CHECK-SAME: (%[[LENGTH:.+]]: index)
-func.func @resourceAlloc(%length: index) -> !stream.resource<transient> {
+util.func public @resourceAlloc(%length: index) -> !stream.resource<transient> {
   // CHECK: %[[BUFFER:.+]], %[[STORAGE:.+]] = hal_inline.buffer.allocate alignment(%c64) : !hal.buffer{%[[LENGTH]]}
   %result = stream.resource.alloc uninitialized : !stream.resource<transient>{%length}
   // CHECK: return %[[STORAGE]]
-  return %result : !stream.resource<transient>
+  util.return %result : !stream.resource<transient>
 }
 
 // -----
 
 // CHECK-LABEL: @resourceAlloca
 // CHECK-SAME: (%[[LENGTH:.+]]: index)
-func.func @resourceAlloca(%length: index) -> (!stream.resource<staging>, !stream.timepoint) {
+util.func public @resourceAlloca(%length: index) -> (!stream.resource<staging>, !stream.timepoint) {
   // CHECK: %[[BUFFER:.+]], %[[STORAGE:.+]] = hal_inline.buffer.allocate alignment(%c64) : !hal.buffer{%[[LENGTH]]}
   %0:2 = stream.resource.alloca uninitialized : !stream.resource<staging>{%length} => !stream.timepoint
   // CHECK: %[[IMMEDIATE:.+]] = arith.constant 0 : i64
   // CHECK: return %[[STORAGE]], %[[IMMEDIATE]]
-  return %0#0, %0#1 : !stream.resource<staging>, !stream.timepoint
+  util.return %0#0, %0#1 : !stream.resource<staging>, !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @resourceAllocaAwait
 // CHECK-SAME: (%[[LENGTH:.+]]: index, %[[TIMEPOINT:.+]]: i64)
-func.func @resourceAllocaAwait(%length: index, %await_timepoint: !stream.timepoint) -> (!stream.resource<staging>, !stream.timepoint) {
+util.func public @resourceAllocaAwait(%length: index, %await_timepoint: !stream.timepoint) -> (!stream.resource<staging>, !stream.timepoint) {
   // CHECK: %[[BUFFER:.+]], %[[STORAGE:.+]] = hal_inline.buffer.allocate alignment(%c64) : !hal.buffer{%[[LENGTH]]}
   %0:2 = stream.resource.alloca uninitialized await(%await_timepoint) => !stream.resource<staging>{%length} => !stream.timepoint
   // CHECK: %[[IMMEDIATE:.+]] = arith.constant 0 : i64
   // CHECK: return %[[STORAGE]], %[[IMMEDIATE]]
-  return %0#0, %0#1 : !stream.resource<staging>, !stream.timepoint
+  util.return %0#0, %0#1 : !stream.resource<staging>, !stream.timepoint
 }
 
 // -----
@@ -38,11 +38,11 @@
 // NOTE: we don't do anything with deallocs today but could add a discard op.
 
 // CHECK-LABEL: @resourceDealloca
-func.func @resourceDealloca(%arg0: index, %arg1: !stream.resource<staging>, %arg2: !stream.timepoint) -> !stream.timepoint {
+util.func public @resourceDealloca(%arg0: index, %arg1: !stream.resource<staging>, %arg2: !stream.timepoint) -> !stream.timepoint {
   %0 = stream.resource.dealloca %arg1 : !stream.resource<staging>{%arg0} => !stream.timepoint
   // CHECK: %[[IMMEDIATE:.+]] = arith.constant 0 : i64
   // CHECK: return %[[IMMEDIATE]]
-  return %0 : !stream.timepoint
+  util.return %0 : !stream.timepoint
 }
 
 // -----
@@ -50,28 +50,28 @@
 // NOTE: we don't do anything with deallocs today but could add a discard op.
 
 // CHECK-LABEL: @resourceDeallocaAwait
-func.func @resourceDeallocaAwait(%arg0: index, %arg1: !stream.resource<staging>, %arg2: !stream.timepoint) -> !stream.timepoint {
+util.func public @resourceDeallocaAwait(%arg0: index, %arg1: !stream.resource<staging>, %arg2: !stream.timepoint) -> !stream.timepoint {
   %0 = stream.resource.dealloca await(%arg2) => %arg1 : !stream.resource<staging>{%arg0} => !stream.timepoint
   // CHECK: %[[IMMEDIATE:.+]] = arith.constant 0 : i64
   // CHECK: return %[[IMMEDIATE]]
-  return %0 : !stream.timepoint
+  util.return %0 : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @resourceSize
-func.func @resourceSize(%arg0: !stream.resource<transient>) -> index {
+util.func public @resourceSize(%arg0: !stream.resource<transient>) -> index {
   // CHECK: %[[SIZE:.+]] = util.buffer.size %arg0
   %0 = stream.resource.size %arg0 : !stream.resource<transient>
   // CHECK: return %[[SIZE]]
-  return %0 : index
+  util.return %0 : index
 }
 
 // -----
 
 // CHECK-LABEL: @resourceTryMap
 // CHECK-SAME: (%[[SOURCE:.+]]: !util.buffer)
-func.func @resourceTryMap(%source: !util.buffer) -> (i1, !stream.resource<constant>) {
+util.func public @resourceTryMap(%source: !util.buffer) -> (i1, !stream.resource<constant>) {
   // CHECK-DAG: %[[OFFSET:.+]] = arith.constant 100
   %offset = arith.constant 100 : index
   // CHECK-DAG: %[[LENGTH:.+]] = arith.constant 128
@@ -81,41 +81,41 @@
   // CHECK-DAG: %[[DID_MAP:.+]] = arith.constant true
   %did_map, %mapping = stream.resource.try_map %source[%offset] : !util.buffer -> i1, !stream.resource<constant>{%length}
   // CHECK: return %[[DID_MAP]], %[[MAPPING]]
-  return %did_map, %mapping : i1, !stream.resource<constant>
+  util.return %did_map, %mapping : i1, !stream.resource<constant>
 }
 
 // -----
 
 // CHECK-LABEL: @resourceLoad
 // CHECK-SAME: (%[[BUFFER:.+]]: !util.buffer, %[[BUFFER_SIZE:.+]]: index, %[[OFFSET:.+]]: index)
-func.func @resourceLoad(%resource: !stream.resource<staging>, %resource_size: index, %offset: index) -> i32 {
+util.func public @resourceLoad(%resource: !stream.resource<staging>, %resource_size: index, %offset: index) -> i32 {
   // CHECK: %[[VALUE:.+]] = util.buffer.load %[[BUFFER]][%[[OFFSET]] for {{.+}}] : !util.buffer{%[[BUFFER_SIZE]]} -> i32
   %0 = stream.resource.load %resource[%offset] : !stream.resource<staging>{%resource_size} -> i32
   // CHECK: return %[[VALUE]]
-  return %0 : i32
+  util.return %0 : i32
 }
 
 // -----
 
 // CHECK-LABEL: @resourceStore
 // CHECK-SAME: (%[[BUFFER:.+]]: !util.buffer, %[[BUFFER_SIZE:.+]]: index, %[[OFFSET:.+]]: index)
-func.func @resourceStore(%resource: !stream.resource<staging>, %resource_size: index, %offset: index) {
+util.func public @resourceStore(%resource: !stream.resource<staging>, %resource_size: index, %offset: index) {
   // CHECK-DAG: %[[VALUE:.+]] = arith.constant 123
   %value = arith.constant 123 : i32
   // CHECK: util.buffer.store %[[VALUE]], %[[BUFFER]][%[[OFFSET]] for {{.+}}] : i32 -> !util.buffer{%[[BUFFER_SIZE]]}
   stream.resource.store %value, %resource[%offset] : i32 -> !stream.resource<staging>{%resource_size}
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @resourceSubview
 // CHECK-SAME: (%[[BUFFER:.+]]: !util.buffer, %[[BUFFER_SIZE:.+]]: index)
-func.func @resourceSubview(%resource: !stream.resource<transient>, %resource_size: index) -> !stream.resource<transient> {
+util.func public @resourceSubview(%resource: !stream.resource<transient>, %resource_size: index) -> !stream.resource<transient> {
   %c128 = arith.constant 128 : index
   %c256 = arith.constant 256 : index
   // CHECK: %[[SUBSPAN:.+]] = util.buffer.subspan %[[BUFFER]][%c128] : !util.buffer{%[[BUFFER_SIZE]]} -> !util.buffer{%c256}
   %0 = stream.resource.subview %resource[%c128] : !stream.resource<transient>{%resource_size} -> !stream.resource<transient>{%c256}
   // CHECK: return %[[SUBSPAN]]
-  return %0 : !stream.resource<transient>
+  util.return %0 : !stream.resource<transient>
 }

diff --git a/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/timepoint_ops.mlir b/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/timepoint_ops.mlir
index aef6a09..67793ea 100644
--- a/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/timepoint_ops.mlir
+++ b/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/timepoint_ops.mlir

@@ -6,43 +6,43 @@
 // CHECK-LABEL: @rwTimepoint
 // CHECK-SAME: = 0 : i64
 util.global private mutable @rwTimepoint = #stream.timepoint<immediate>
-// CHECK: func.func @globalTimepoint(%arg0: i64) -> i64
-func.func @globalTimepoint(%arg0: !stream.timepoint) -> !stream.timepoint {
+// CHECK: util.func public @globalTimepoint(%arg0: i64) -> i64
+util.func public @globalTimepoint(%arg0: !stream.timepoint) -> !stream.timepoint {
   // CHECK: util.global.store %arg0, @rwTimepoint
   util.global.store %arg0, @rwTimepoint : !stream.timepoint
   // CHECK: %[[VALUE:.+]] = util.global.load @rwTimepoint
   %value = util.global.load @rwTimepoint : !stream.timepoint
   // CHECK: return %[[VALUE]]
-  return %value : !stream.timepoint
+  util.return %value : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @timepointImmediate
-func.func @timepointImmediate() -> !stream.timepoint {
+util.func public @timepointImmediate() -> !stream.timepoint {
   // CHECK: %[[TIMEPOINT:.+]] = arith.constant 0
   %0 = stream.timepoint.immediate => !stream.timepoint
   // CHECK: return %[[TIMEPOINT]]
-  return %0 : !stream.timepoint
+  util.return %0 : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @timepointJoin
-func.func @timepointJoin(%arg0: !stream.timepoint, %arg1: !stream.timepoint) -> !stream.timepoint {
+util.func public @timepointJoin(%arg0: !stream.timepoint, %arg1: !stream.timepoint) -> !stream.timepoint {
   // CHECK: %[[TIMEPOINT:.+]] = arith.constant 0
   %0 = stream.timepoint.join max(%arg0, %arg1) => !stream.timepoint
   // CHECK: return %[[TIMEPOINT]]
-  return %0 : !stream.timepoint
+  util.return %0 : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @timepointAwait
-func.func @timepointAwait(%arg0: !stream.timepoint, %arg1: !stream.resource<staging>, %arg2: !stream.resource<*>) -> (!stream.resource<staging>, !stream.resource<*>) {
+util.func public @timepointAwait(%arg0: !stream.timepoint, %arg1: !stream.resource<staging>, %arg2: !stream.resource<*>) -> (!stream.resource<staging>, !stream.resource<*>) {
   %c100 = arith.constant 100 : index
   %c200 = arith.constant 200 : index
   %0:2 = stream.timepoint.await %arg0 => %arg1, %arg2 : !stream.resource<staging>{%c100}, !stream.resource<*>{%c200}
   // CHECK: return %arg1, %arg2
-  return %0#0, %0#1 : !stream.resource<staging>, !stream.resource<*>
+  util.return %0#0, %0#1 : !stream.resource<staging>, !stream.resource<*>
 }

diff --git a/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/transfer_ops.mlir b/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/transfer_ops.mlir
index 2bbd473..8b6562a 100644
--- a/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/transfer_ops.mlir
+++ b/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/test/transfer_ops.mlir

@@ -2,10 +2,10 @@
 
 // CHECK-LABEL: @tensorImportBuffer
 // CHECK-SAME: (%[[BUFFER:.+]]: !hal.buffer, %[[RESOURCE_SIZE:.+]]: index, %[[DIM:.+]]: index) -> !hal.buffer
-func.func @tensorImportBuffer(%buffer: !hal.buffer, %resource_size: index, %dim: index) -> !stream.resource<external> {
+util.func public @tensorImportBuffer(%buffer: !hal.buffer, %resource_size: index, %dim: index) -> !stream.resource<external> {
   %0 = stream.tensor.import %buffer : !hal.buffer -> tensor<?x5xf32>{%dim} in !stream.resource<external>{%resource_size}
   // CHECK: return %[[BUFFER]]
-  return %0 : !stream.resource<external>
+  util.return %0 : !stream.resource<external>
 }
 
 // -----
@@ -16,28 +16,28 @@
 
 // CHECK-LABEL: @tensorImportBufferView
 // CHECK-SAME: (%[[BUFFER_VIEW:.+]]: !hal.buffer_view, %[[RESOURCE_SIZE:.+]]: index, %[[DIM:.+]]: index) -> !hal.buffer
-func.func @tensorImportBufferView(%buffer_view: !hal.buffer_view, %resource_size: index, %dim: index) -> !stream.resource<external> {
+util.func public @tensorImportBufferView(%buffer_view: !hal.buffer_view, %resource_size: index, %dim: index) -> !stream.resource<external> {
   // CHECK: %[[BUFFER:.+]] = hal_inline.buffer_view.buffer<%[[BUFFER_VIEW]] : !hal.buffer_view> : !hal.buffer
   %0 = stream.tensor.import %buffer_view : !hal.buffer_view -> tensor<?x5xf32>{%dim} in !stream.resource<external>{%resource_size}
   // CHECK: return %[[BUFFER]]
-  return %0 : !stream.resource<external>
+  util.return %0 : !stream.resource<external>
 }
 
 // -----
 
 // CHECK-LABEL: @tensorExportBuffer
 // CHECK-SAME: (%[[BUFFER:.+]]: !hal.buffer, %[[RESOURCE_SIZE:.+]]: index, %[[DIM:.+]]: index) -> !hal.buffer
-func.func @tensorExportBuffer(%resource: !stream.resource<external>, %resource_size: index, %dim: index) -> !hal.buffer {
+util.func public @tensorExportBuffer(%resource: !stream.resource<external>, %resource_size: index, %dim: index) -> !hal.buffer {
   %0 = stream.tensor.export %resource : tensor<?x1x10xf32>{%dim} in !stream.resource<external>{%resource_size} -> !hal.buffer
   // CHECK: return %[[BUFFER]]
-  return %0 : !hal.buffer
+  util.return %0 : !hal.buffer
 }
 
 // -----
 
 // CHECK-LABEL: @tensorExportBufferView
 // CHECK-SAME: (%[[BUFFER:.+]]: !hal.buffer, %[[RESOURCE_SIZE:.+]]: index, %[[DIM:.+]]: index) -> !hal.buffer
-func.func @tensorExportBufferView(%resource: !stream.resource<external>, %resource_size: index, %dim: index) -> !hal.buffer_view {
+util.func public @tensorExportBufferView(%resource: !stream.resource<external>, %resource_size: index, %dim: index) -> !hal.buffer_view {
   // CHECK: %[[BUFFER_VIEW:.+]] = hal_inline.buffer_view.create
   // CHECK-SAME: buffer(%[[BUFFER]] : !hal.buffer)
   // CHECK-SAME: shape([%[[DIM]], %c1, %c10])
@@ -45,5 +45,5 @@
   // CHECK-SAME: encoding(%c1_i32)
   %0 = stream.tensor.export %resource : tensor<?x1x10xf32>{%dim} in !stream.resource<external>{%resource_size} -> !hal.buffer_view
   // CHECK: return %[[BUFFER_VIEW]]
-  return %0 : !hal.buffer_view
+  util.return %0 : !hal.buffer_view
 }

diff --git a/compiler/src/iree/compiler/Modules/HAL/Inline/IR/test/buffer_folding.mlir b/compiler/src/iree/compiler/Modules/HAL/Inline/IR/test/buffer_folding.mlir
index 0d0ad9f..f02fe9e 100644
--- a/compiler/src/iree/compiler/Modules/HAL/Inline/IR/test/buffer_folding.mlir
+++ b/compiler/src/iree/compiler/Modules/HAL/Inline/IR/test/buffer_folding.mlir

@@ -1,34 +1,34 @@
 // RUN: iree-opt --split-input-file --canonicalize -cse %s | iree-opt --split-input-file | FileCheck %s
 
-// CHECK-LABEL: func @FoldBufferLengthOp
+// CHECK-LABEL: @FoldBufferLengthOp
 // CHECK-SAME: (%[[LENGTH:.+]]: index)
-func.func @FoldBufferLengthOp(%length: index) -> index {
+util.func public @FoldBufferLengthOp(%length: index) -> index {
   %c64 = arith.constant 64 : index
   %buffer, %storage = hal_inline.buffer.allocate alignment(%c64) : !hal.buffer{%length} in !util.buffer
   // CHECK-NOT: hal_inline.buffer.length
   %queried_length = hal_inline.buffer.length<%buffer : !hal.buffer> : index
   // CHECK: return %[[LENGTH]]
-  return %queried_length : index
+  util.return %queried_length : index
 }
 
 // -----
 
-// CHECK-LABEL: func @FoldBufferStorageOp
-func.func @FoldBufferStorageOp(%length: index) -> !util.buffer {
+// CHECK-LABEL: @FoldBufferStorageOp
+util.func public @FoldBufferStorageOp(%length: index) -> !util.buffer {
   %c64 = arith.constant 64 : index
   // CHECK: %[[BUFFER:.+]], %[[STORAGE:.+]] = hal_inline.buffer.allocate
   %buffer, %storage = hal_inline.buffer.allocate alignment(%c64) : !hal.buffer{%length} in !util.buffer
   // CHECK-NOT: hal_inline.buffer.storage
   %queried_storage = hal_inline.buffer.storage<%buffer : !hal.buffer> : !util.buffer
   // CHECK: return %[[STORAGE]]
-  return %queried_storage : !util.buffer
+  util.return %queried_storage : !util.buffer
 }
 
 // -----
 
 // CHECK-LABEL: @FoldBufferViewCreateSubspan
 // CHECK-SAME: (%[[BASE_BUFFER:.+]]: !hal.buffer, %[[SUBSPAN_OFFSET:.+]]: index, %[[SUBSPAN_LENGTH:.+]]: index)
-func.func @FoldBufferViewCreateSubspan(%base_buffer: !hal.buffer, %subspan_offset: index, %subspan_length: index) -> !hal.buffer_view {
+util.func public @FoldBufferViewCreateSubspan(%base_buffer: !hal.buffer, %subspan_offset: index, %subspan_length: index) -> !hal.buffer_view {
   %subspan = hal_inline.buffer.subspan<%base_buffer : !hal.buffer>[%subspan_offset, %subspan_length] : !hal.buffer
   // CHECK-DAG: %[[VIEW_OFFSET:.+]] = arith.constant 512
   %view_offset = arith.constant 512 : index
@@ -44,14 +44,14 @@
                                  shape([%dim0])
                                  type(%type)
                                  encoding(%encoding) : !hal.buffer_view
-  return %view : !hal.buffer_view
+  util.return %view : !hal.buffer_view
 }
 
 // -----
 
-// CHECK-LABEL: func @SkipBufferViewBufferOp
+// CHECK-LABEL: @SkipBufferViewBufferOp
 // CHECK-SAME: (%[[BUFFER:.+]]: !hal.buffer)
-func.func @SkipBufferViewBufferOp(%buffer: !hal.buffer) -> !hal.buffer {
+util.func public @SkipBufferViewBufferOp(%buffer: !hal.buffer) -> !hal.buffer {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : i32
   %c10 = arith.constant 10 : index
@@ -64,5 +64,5 @@
                                         encoding(%c1) : !hal.buffer_view
   %view_buffer = hal_inline.buffer_view.buffer<%view : !hal.buffer_view> : !hal.buffer
   // CHECK: return %[[BUFFER]]
-  return %view_buffer : !hal.buffer
+  util.return %view_buffer : !hal.buffer
 }

diff --git a/compiler/src/iree/compiler/Modules/HAL/Inline/Transforms/InlineExecutables.cpp b/compiler/src/iree/compiler/Modules/HAL/Inline/Transforms/InlineExecutables.cpp
index 95173a6..f317326 100644
--- a/compiler/src/iree/compiler/Modules/HAL/Inline/Transforms/InlineExecutables.cpp
+++ b/compiler/src/iree/compiler/Modules/HAL/Inline/Transforms/InlineExecutables.cpp

@@ -115,7 +115,7 @@
           innerModuleBuilder.getFunctionType(inputTypes, {});
 
       // Create the function and insert into the module.
-      auto dispatchFuncOp = func::FuncOp::create(
+      auto dispatchFuncOp = IREE::Util::FuncOp::create(
           exportOp.getLoc(),
           ("__dispatch_" + executableOp.getName() + "_" + exportOp.getName())
               .str(),
@@ -127,7 +127,7 @@
 
       // Build the dispatch function by calling the target function in a loop.
       auto bodyFuncOp =
-          innerSymbolTable.lookup<func::FuncOp>(exportOp.getName());
+          innerSymbolTable.lookup<FunctionOpInterface>(exportOp.getName());
       if (!bodyFuncOp) {
         return exportOp.emitOpError("missing body function");
       }
@@ -181,7 +181,8 @@
   // about the function signatures.
   LogicalResult
   rewriteWorkgroupSignature(IREE::HAL::PipelineLayoutAttr layoutAttr,
-                            size_t totalBindingCount, func::FuncOp bodyFuncOp) {
+                            size_t totalBindingCount,
+                            FunctionOpInterface bodyFuncOp) {
     auto *entryBlock = &bodyFuncOp.front();
     auto builder = OpBuilder::atBlockBegin(entryBlock);
     auto indexType = builder.getIndexType();
@@ -333,8 +334,9 @@
   //    workgroup_count_x, workgroup_count_y, workgroup_count_z)
   void buildDispatchFunc(IREE::HAL::ExecutableExportOp exportOp,
                          IREE::HAL::PipelineLayoutAttr layoutAttr,
-                         size_t totalBindingCount, func::FuncOp bodyFuncOp,
-                         func::FuncOp dispatchFuncOp) {
+                         size_t totalBindingCount,
+                         FunctionOpInterface bodyFuncOp,
+                         FunctionOpInterface dispatchFuncOp) {
     auto loc = exportOp.getLoc();
     auto builder = OpBuilder::atBlockBegin(dispatchFuncOp.addEntryBlock());
     IndexSet indexSet(loc, builder);
@@ -408,8 +410,9 @@
                     [&](OpBuilder &forXBuilder, Location loc, Value ix,
                         ValueRange iters) {
                       workgroupArgs[workgroupXYZOffset + 0] = ix;
-                      forXBuilder.create<func::CallOp>(loc, bodyFuncOp,
-                                                       workgroupArgs);
+                      forXBuilder.create<func::CallOp>(
+                          loc, bodyFuncOp.getNameAttr(),
+                          bodyFuncOp.getResultTypes(), workgroupArgs);
                       forXBuilder.create<scf::YieldOp>(loc);
                     });
                 forYBuilder.create<scf::YieldOp>(loc);
@@ -417,7 +420,7 @@
           forZBuilder.create<scf::YieldOp>(loc);
         });
 
-    builder.create<func::ReturnOp>(loc);
+    builder.create<IREE::Util::ReturnOp>(loc);
   }
 };
 

diff --git a/compiler/src/iree/compiler/Modules/HAL/Inline/Transforms/test/inline_executables.mlir b/compiler/src/iree/compiler/Modules/HAL/Inline/Transforms/test/inline_executables.mlir
index 14f4d28..f88d821 100644
--- a/compiler/src/iree/compiler/Modules/HAL/Inline/Transforms/test/inline_executables.mlir
+++ b/compiler/src/iree/compiler/Modules/HAL/Inline/Transforms/test/inline_executables.mlir

@@ -29,7 +29,7 @@
         util.global.store %buffer_cst, @global_constant : !util.buffer
         util.return
       }
-      func.func @dispatch_0(
+      func.func public @dispatch_0(
           %local_memory: !util.buffer,
           %constants: !util.buffer,
           %bindings: !util.list<!util.buffer>,
@@ -68,7 +68,7 @@
           %scaled = arith.mulf %mul, %constant1_f32 : f32
           util.buffer.store %scaled, %buffer2[%idx for %c4] : f32 -> !util.buffer{%buffer2_size}
         }
-        return
+        func.return
       }
     }
   }
@@ -83,9 +83,9 @@
 // CHECK:   util.global.store %[[CONSTANT]], @global_constant
 
 // Ensures that we properly rename the dispatch function we inline:
-func.func private @dispatch_0()
+util.func private @dispatch_0()
 
-// CHECK-LABEL: func private @dispatch_0_0
+// CHECK-LABEL: func.func private @dispatch_0_0
 // CHECK-SAME: (%[[LOCAL_MEMORY:.+]]: !util.buffer, %[[CONSTANT0:.+]]: i32, %[[CONSTANT1:.+]]: i32,
 // CHECK-SAME:  %[[BINDING0:.+]]: !util.buffer, %[[BINDING1:.+]]: !util.buffer, %[[BINDING2:.+]]: !util.buffer,
 // CHECK-SAME:  %[[X:[a-z0-9]+]]: index, %[[Y:[a-z0-9]+]]: index, %[[Z:[a-z0-9]+]]: index,
@@ -117,7 +117,7 @@
 // CHECK:   util.buffer.store %[[SCALED]], %[[BINDING2]][%[[ELEMENT_OFFSET]] for {{.+}}] : f32 -> !util.buffer{%[[BINDING2_SIZE]]}
 // CHECK: return
 
-// CHECK-LABEL: func private @__dispatch_ex_dispatch_0
+// CHECK-LABEL: util.func private @__dispatch_ex_dispatch_0
 // CHECK-SAME: (%[[WORKLOAD_X:.+]]: index, %[[WORKLOAD_Y:.+]]: index, %[[CONSTANT0:.+]]: i32, %[[CONSTANT1:.+]]: i32,
 // CHECK-SAME:  %[[BINDING0:.+]]: !util.buffer, %[[BINDING1:.+]]: !util.buffer, %[[BINDING2:.+]]: !util.buffer,
 // CHECK-SAME:  %[[OFFSET0:[a-z0-9]+]]: index, %[[OFFSET1:[a-z0-9]+]]: index, %[[OFFSET2:[a-z0-9]+]]: index,
@@ -151,13 +151,13 @@
 // CHECK-SAME:         %[[X]], %[[Y]], %[[Z]],
 // CHECK-SAME:         %[[SIZE_XYZ]], %[[SIZE_XYZ]], %[[SIZE_XYZ]],
 // CHECK-SAME:         %[[COUNT_X]], %[[COUNT_Y]], %[[COUNT_Z]])
-// CHECK:   return
+// CHECK:   util.return
 
 // CHECK-LABEL: @dispatch0
 // CHECK-SAME: (%[[RESOURCE0:.+]]: !stream.resource<constant>,
 // CHECK-SAME:  %[[RESOURCE1:.+]]: !stream.resource<transient>,
 // CHECK-SAME:  %[[RESOURCE2:.+]]: !stream.resource<external>)
-func.func private @dispatch0(%resource0: !stream.resource<constant>, %resource1: !stream.resource<transient>, %resource2: !stream.resource<external>) {
+util.func private @dispatch0(%resource0: !stream.resource<constant>, %resource1: !stream.resource<transient>, %resource2: !stream.resource<external>) {
   %workload_x = arith.constant 1000 : index
   %workload_y = arith.constant 1001 : index
   %constant0 = arith.constant 4 : i32
@@ -186,5 +186,5 @@
       ]
     }
   } => !stream.timepoint
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Modules/HAL/Loader/Conversion/HALLoaderToVM/test/executable_ops.mlir b/compiler/src/iree/compiler/Modules/HAL/Loader/Conversion/HALLoaderToVM/test/executable_ops.mlir
index 8328041..fb5e5f1 100644
--- a/compiler/src/iree/compiler/Modules/HAL/Loader/Conversion/HALLoaderToVM/test/executable_ops.mlir
+++ b/compiler/src/iree/compiler/Modules/HAL/Loader/Conversion/HALLoaderToVM/test/executable_ops.mlir

@@ -3,13 +3,13 @@
 
 // CHECK-LABEL: @executableLoad
 // CHECK-SAME: (%[[EXECUTABLE_DATA:.+]]: !vm.buffer)
-func.func @executableLoad(%executable_data: !util.buffer) -> !hal.executable {
+util.func public @executableLoad(%executable_data: !util.buffer) -> !hal.executable {
   // CHECK-DAG: %[[CONSTANTS:.+]] = vm.const.ref.zero : !vm.buffer
   // CHECK-DAG: %[[FORMAT_STR:.+]] = vm.rodata.inline {{.+}} : !vm.buffer = "executable_format"
   // CHECK: %[[EXECUTABLE:.+]] = vm.call @hal_loader.executable.load(%[[FORMAT_STR]], %[[EXECUTABLE_DATA]], %[[CONSTANTS]])
   %executable = hal_loader.executable.load format("executable_format") data(%executable_data) : !hal.executable
   // CHECK: return %[[EXECUTABLE]]
-  return %executable : !hal.executable
+  util.return %executable : !hal.executable
 }
 
 // -----
@@ -17,7 +17,7 @@
 // CHECK-LABEL: @executableDispatch
 // CHECK-SAME: (%[[EXECUTABLE:.+]]: !vm.ref<!hal.executable>,
 // CHECK-SAME:  %[[BUFFER0:.+]]: !vm.buffer, %[[BUFFER1:.+]]: !vm.buffer)
-func.func @executableDispatch(%executable: !hal.executable, %buffer0: !util.buffer, %buffer1: !util.buffer) {
+util.func public @executableDispatch(%executable: !hal.executable, %buffer0: !util.buffer, %buffer1: !util.buffer) {
   // CHECK-DAG: %[[COUNT_X:.+]] = vm.const.i32 1000
   %count_x = arith.constant 1000 : index
   // CHECK-DAG: %[[COUNT_Y:.+]] = vm.const.i32 1001
@@ -50,5 +50,5 @@
       // CHECK-SAME: (%[[BUFFER1]], %[[OFFSET1]], %[[LENGTH1]])
       (%buffer1 : !util.buffer)[%offset1, %length1]
     ])
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Modules/HAL/Loader/Conversion/StreamToHALLoader/test/cmd_ops.mlir b/compiler/src/iree/compiler/Modules/HAL/Loader/Conversion/StreamToHALLoader/test/cmd_ops.mlir
index 7e5471d..08f834a 100644
--- a/compiler/src/iree/compiler/Modules/HAL/Loader/Conversion/StreamToHALLoader/test/cmd_ops.mlir
+++ b/compiler/src/iree/compiler/Modules/HAL/Loader/Conversion/StreamToHALLoader/test/cmd_ops.mlir

@@ -32,7 +32,7 @@
 // CHECK-LABEL: @cmdDispatch
 // CHECK-SAME: (%[[BUFFER0:.+]]: !util.buffer, %[[BUFFER0_SIZE:.+]]: index,
 // CHECK-SAME:  %[[BUFFER1:.+]]: !hal.buffer, %[[BUFFER1_SIZE:.+]]: index)
-func.func @cmdDispatch(%buffer0: !stream.resource<transient>, %buffer0_size: index,
+util.func public @cmdDispatch(%buffer0: !stream.resource<transient>, %buffer0_size: index,
                        %buffer1: !stream.resource<external>, %buffer1_size: index) -> !stream.timepoint {
   // (ends up by the dispatch below)
   %workload_x = arith.constant 1000 : index
@@ -87,5 +87,5 @@
     }
   } => !stream.timepoint
   // CHECK: return %c0
-  return %fence : !stream.timepoint
+  util.return %fence : !stream.timepoint
 }

diff --git a/compiler/src/iree/compiler/Modules/HAL/Loader/IR/test/dispatch_folding.mlir b/compiler/src/iree/compiler/Modules/HAL/Loader/IR/test/dispatch_folding.mlir
index 150cecb..b47948a 100644
--- a/compiler/src/iree/compiler/Modules/HAL/Loader/IR/test/dispatch_folding.mlir
+++ b/compiler/src/iree/compiler/Modules/HAL/Loader/IR/test/dispatch_folding.mlir

@@ -1,7 +1,7 @@
 // RUN: iree-opt --split-input-file --canonicalize -cse %s | iree-opt --split-input-file | FileCheck %s
 
 // CHECK-LABEL: @fold_binding_subspans_into_dispatch
-func.func @fold_binding_subspans_into_dispatch(
+util.func public @fold_binding_subspans_into_dispatch(
     // CHECK-SAME: %[[EXECUTABLE:.+]]: !hal.executable,
     %executable: !hal.executable,
     // CHECK-SAME: %[[BUFFER:.+]]: !util.buffer, %[[SUBSPAN_OFFSET:.+]]: index, %[[SUBSPAN_LENGTH:.+]]: index
@@ -26,5 +26,5 @@
       // CHECK: (%[[BUFFER]] : !util.buffer)[%[[ABSOLUTE_OFFSET]], %[[BINDING_LENGTH]]]
       (%subspan : !util.buffer)[%binding_offset, %binding_length]
     ])
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Modules/HAL/Loader/Transforms/test/materialize_executables.mlir b/compiler/src/iree/compiler/Modules/HAL/Loader/Transforms/test/materialize_executables.mlir
index 5294d5a..cafb262 100644
--- a/compiler/src/iree/compiler/Modules/HAL/Loader/Transforms/test/materialize_executables.mlir
+++ b/compiler/src/iree/compiler/Modules/HAL/Loader/Transforms/test/materialize_executables.mlir

@@ -24,11 +24,11 @@
 }
 
 // CHECK-LABEL: @get_ex0
-func.func private @get_ex0() -> !hal.executable {
+util.func private @get_ex0() -> !hal.executable {
   // CHECK: %[[EX0:.+]] = util.global.load @ex0 : !hal.executable
   %ex0 = hal_loader.executable.lookup executable(@ex0) : !hal.executable
   // CHECK: return %[[EX0]]
-  return %ex0 : !hal.executable
+  util.return %ex0 : !hal.executable
 }
 
 // CHECK: util.global private @ex1 : !hal.executable
@@ -40,9 +40,9 @@
 }
 
 // CHECK-LABEL: @get_ex1
-func.func private @get_ex1() -> !hal.executable {
+util.func private @get_ex1() -> !hal.executable {
   // CHECK: %[[EX1:.+]] = util.global.load @ex1 : !hal.executable
   %ex1 = hal_loader.executable.lookup executable(@ex1) : !hal.executable
   // CHECK: return %[[EX1]]
-  return %ex1 : !hal.executable
+  util.return %ex1 : !hal.executable
 }

diff --git a/compiler/src/iree/compiler/Modules/IO/Parameters/Conversion/ParamsToVM/test/parameter_ops.mlir b/compiler/src/iree/compiler/Modules/IO/Parameters/Conversion/ParamsToVM/test/parameter_ops.mlir
index 54ddd68..a570f9f 100644
--- a/compiler/src/iree/compiler/Modules/IO/Parameters/Conversion/ParamsToVM/test/parameter_ops.mlir
+++ b/compiler/src/iree/compiler/Modules/IO/Parameters/Conversion/ParamsToVM/test/parameter_ops.mlir

@@ -3,7 +3,7 @@
 
 // CHECK-LABEL: @parameterLoad
 // CHECK-SAME: (%[[DEVICE:.+]]: !vm.ref<!hal.device>, %[[QUEUE_AFFINITY:.+]]: i64, %[[WAIT:.+]]: !vm.ref<!hal.fence>, %[[SIGNAL:.+]]: !vm.ref<!hal.fence>)
-func.func @parameterLoad(%device: !hal.device, %queue_affinity: i64, %wait: !hal.fence, %signal: !hal.fence) -> (!hal.buffer, !hal.buffer) {
+util.func public @parameterLoad(%device: !hal.device, %queue_affinity: i64, %wait: !hal.fence, %signal: !hal.fence) -> (!hal.buffer, !hal.buffer) {
   %c50_i64 = arith.constant 50 : i64
   %c51_i64 = arith.constant 51 : i64
   %c100 = arith.constant 100 : index
@@ -23,14 +23,14 @@
   // CHECK-DAG: %[[C1:.+]] = vm.const.i32 1
   // CHECK-DAG: %[[TARGET_BUFFER1:.+]] = vm.list.get.ref %[[TARGET_BUFFERS]], %[[C1]]
   // CHECK: return %[[TARGET_BUFFER0]], %[[TARGET_BUFFER1]]
-  return %target_buffers#0, %target_buffers#1 : !hal.buffer, !hal.buffer
+  util.return %target_buffers#0, %target_buffers#1 : !hal.buffer, !hal.buffer
 }
 
 // -----
 
 // CHECK-LABEL: @parameterLoadNoScope
 // CHECK-SAME: (%[[DEVICE:.+]]: !vm.ref<!hal.device>, %[[QUEUE_AFFINITY:.+]]: i64, %[[WAIT:.+]]: !vm.ref<!hal.fence>, %[[SIGNAL:.+]]: !vm.ref<!hal.fence>)
-func.func @parameterLoadNoScope(%device: !hal.device, %queue_affinity: i64, %wait: !hal.fence, %signal: !hal.fence) -> !hal.buffer {
+util.func public @parameterLoadNoScope(%device: !hal.device, %queue_affinity: i64, %wait: !hal.fence, %signal: !hal.fence) -> !hal.buffer {
   %c50_i64 = arith.constant 50 : i64
   %c100 = arith.constant 100 : index
   //      CHECK: %[[KEY_TABLE:.+]], %[[KEY_DATA:.+]] = vm.rodata.table.inline i32 : !vm.buffer, !vm.buffer = ["key"]
@@ -45,14 +45,14 @@
   // CHECK-DAG: %[[C0:.+]] = vm.const.i32 0
   // CHECK-DAG: %[[TARGET_BUFFER:.+]] = vm.list.get.ref %[[TARGET_BUFFERS]], %[[C0]]
   // CHECK: return %[[TARGET_BUFFER]]
-  return %target_buffer : !hal.buffer
+  util.return %target_buffer : !hal.buffer
 }
 
 // -----
 
 // CHECK-LABEL: @parameterGather
 // CHECK-SAME: (%[[DEVICE:.+]]: !vm.ref<!hal.device>, %[[QUEUE_AFFINITY:.+]]: i64, %[[WAIT:.+]]: !vm.ref<!hal.fence>, %[[SIGNAL:.+]]: !vm.ref<!hal.fence>, %[[TARGET_BUFFER:.+]]: !vm.ref<!hal.buffer>)
-func.func @parameterGather(%device: !hal.device, %queue_affinity: i64, %wait: !hal.fence, %signal: !hal.fence, %target_buffer: !hal.buffer) {
+util.func public @parameterGather(%device: !hal.device, %queue_affinity: i64, %wait: !hal.fence, %signal: !hal.fence, %target_buffer: !hal.buffer) {
   %c50_i64 = arith.constant 50 : i64
   %c51_i64 = arith.constant 51 : i64
   %c52_i64 = arith.constant 52 : i64
@@ -73,14 +73,14 @@
     "scope"::"key1"[%c51_i64] -> %target_buffer[%c101 for %c201] : !hal.buffer,
     "scope"::"key2"[%c52_i64] -> %target_buffer[%c102 for %c202] : !hal.buffer
   }
-  return
+  util.return
 }
 
 // -----
 
 // CHECK-LABEL: @parameterScatter
 // CHECK-SAME: (%[[DEVICE:.+]]: !vm.ref<!hal.device>, %[[QUEUE_AFFINITY:.+]]: i64, %[[WAIT:.+]]: !vm.ref<!hal.fence>, %[[SIGNAL:.+]]: !vm.ref<!hal.fence>, %[[SOURCE_BUFFER:.+]]: !vm.ref<!hal.buffer>)
-func.func @parameterScatter(%device: !hal.device, %queue_affinity: i64, %wait: !hal.fence, %signal: !hal.fence, %source_buffer: !hal.buffer) {
+util.func public @parameterScatter(%device: !hal.device, %queue_affinity: i64, %wait: !hal.fence, %signal: !hal.fence, %source_buffer: !hal.buffer) {
   %c50_i64 = arith.constant 50 : i64
   %c51_i64 = arith.constant 51 : i64
   %c52_i64 = arith.constant 52 : i64
@@ -101,5 +101,5 @@
     %source_buffer[%c101 for %c201] : !hal.buffer -> "scope"::"key1"[%c51_i64],
     %source_buffer[%c102 for %c202] : !hal.buffer -> "scope"::"key2"[%c52_i64]
   }
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Modules/IO/Parameters/Conversion/StreamToParams/test/parameter_ops.mlir b/compiler/src/iree/compiler/Modules/IO/Parameters/Conversion/StreamToParams/test/parameter_ops.mlir
index 92035c3..4842e51 100644
--- a/compiler/src/iree/compiler/Modules/IO/Parameters/Conversion/StreamToParams/test/parameter_ops.mlir
+++ b/compiler/src/iree/compiler/Modules/IO/Parameters/Conversion/StreamToParams/test/parameter_ops.mlir

@@ -2,7 +2,7 @@
 
 // CHECK-LABEL: @parameterLoad
 // CHECK-SAME: (%[[WAIT:.+]]: !hal.fence) -> (!hal.buffer, !hal.buffer, !hal.fence)
-func.func @parameterLoad(%wait: !stream.timepoint) -> (!stream.resource<constant>, !stream.resource<constant>, !stream.timepoint) {
+util.func public @parameterLoad(%wait: !stream.timepoint) -> (!stream.resource<constant>, !stream.resource<constant>, !stream.timepoint) {
   %c50_i64 = arith.constant 50 : i64
   %c51_i64 = arith.constant 51 : i64
   %c100 = arith.constant 100 : index
@@ -20,14 +20,14 @@
     "scope"::"key1"[%c51_i64] : !stream.resource<constant>{%c101}
   } => !stream.timepoint
   // CHECK: return %[[BUFFERS]]#0, %[[BUFFERS]]#1, %[[SIGNAL]]
-  return %results#0, %results#1, %result_timepoint : !stream.resource<constant>, !stream.resource<constant>, !stream.timepoint
+  util.return %results#0, %results#1, %result_timepoint : !stream.resource<constant>, !stream.resource<constant>, !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @parameterLoadNoScope
 // CHECK-SAME: (%[[WAIT:.+]]: !hal.fence) -> (!hal.buffer, !hal.fence)
-func.func @parameterLoadNoScope(%wait: !stream.timepoint) -> (!stream.resource<constant>, !stream.timepoint) {
+util.func public @parameterLoadNoScope(%wait: !stream.timepoint) -> (!stream.resource<constant>, !stream.timepoint) {
   %c50_i64 = arith.constant 50 : i64
   %c100 = arith.constant 100 : index
   // CHECK-DAG: %[[DEVICE:.+]] = hal.devices.get %{{.+}}
@@ -41,14 +41,14 @@
     "key"[%c50_i64] : !stream.resource<constant>{%c100}
   } => !stream.timepoint
   // CHECK: return %[[BUFFER]], %[[SIGNAL]]
-  return %result, %result_timepoint : !stream.resource<constant>, !stream.timepoint
+  util.return %result, %result_timepoint : !stream.resource<constant>, !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @parameterRead
 // CHECK-SAME: (%[[WAIT:.+]]: !hal.fence, %[[TARGET:.+]]: !hal.buffer) -> !hal.fence
-func.func @parameterRead(%wait: !stream.timepoint, %target: !stream.resource<transient>) -> !stream.timepoint {
+util.func public @parameterRead(%wait: !stream.timepoint, %target: !stream.resource<transient>) -> !stream.timepoint {
   %c50_i64 = arith.constant 50 : i64
   %c100 = arith.constant 100 : index
   %c200 = arith.constant 200 : index
@@ -61,14 +61,14 @@
   // CHECK-NEXT:   "scope"::"key"[%c50_i64] -> %[[TARGET]][%c100 for %c200] : !hal.buffer
   %timepoint = stream.parameter.read await(%wait) => "scope"::"key"[%c50_i64] -> %target[%c100 for %c200] : !stream.resource<transient>{%c300} => !stream.timepoint
   // CHECK: return %[[SIGNAL]]
-  return %timepoint : !stream.timepoint
+  util.return %timepoint : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @parameterWrite
 // CHECK-SAME: (%[[WAIT:.+]]: !hal.fence, %[[SOURCE:.+]]: !hal.buffer) -> !hal.fence
-func.func @parameterWrite(%wait: !stream.timepoint, %source: !stream.resource<transient>) -> !stream.timepoint {
+util.func public @parameterWrite(%wait: !stream.timepoint, %source: !stream.resource<transient>) -> !stream.timepoint {
   %c50_i64 = arith.constant 50 : i64
   %c100 = arith.constant 100 : index
   %c200 = arith.constant 200 : index
@@ -81,14 +81,14 @@
   // CHECK-NEXT:   %[[SOURCE]][%c100 for %c200] : !hal.buffer -> "scope"::"key"[%c50_i64]
   %timepoint = stream.parameter.write await(%wait) => %source[%c100 for %c200] : !stream.resource<transient>{%c300} -> "scope"::"key"[%c50_i64] => !stream.timepoint
   // CHECK: return %[[SIGNAL]]
-  return %timepoint : !stream.timepoint
+  util.return %timepoint : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @parameterGather
 // CHECK-SAME: (%[[WAIT:.+]]: !hal.fence, %[[TARGET:.+]]: !hal.buffer) -> !hal.fence
-func.func @parameterGather(%wait: !stream.timepoint, %target: !stream.resource<transient>) -> !stream.timepoint {
+util.func public @parameterGather(%wait: !stream.timepoint, %target: !stream.resource<transient>) -> !stream.timepoint {
   %c50_i64 = arith.constant 50 : i64
   %c51_i64 = arith.constant 51 : i64
   %c52_i64 = arith.constant 52 : i64
@@ -113,14 +113,14 @@
     "scope"::"key2"[%c52_i64] -> %target[%c102 for %c202] : !stream.resource<transient>{%c300}
   } => !stream.timepoint
   // CHECK: return %[[SIGNAL]]
-  return %timepoint : !stream.timepoint
+  util.return %timepoint : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @parameterGatherNoScope
 // CHECK-SAME: (%[[WAIT:.+]]: !hal.fence, %[[TARGET:.+]]: !hal.buffer) -> !hal.fence
-func.func @parameterGatherNoScope(%wait: !stream.timepoint, %target: !stream.resource<transient>) -> !stream.timepoint {
+util.func public @parameterGatherNoScope(%wait: !stream.timepoint, %target: !stream.resource<transient>) -> !stream.timepoint {
   %c50_i64 = arith.constant 50 : i64
   %c51_i64 = arith.constant 51 : i64
   %c100 = arith.constant 100 : index
@@ -140,14 +140,14 @@
     "key1"[%c51_i64] -> %target[%c101 for %c201] : !stream.resource<transient>{%c300}
   } => !stream.timepoint
   // CHECK: return %[[SIGNAL]]
-  return %timepoint : !stream.timepoint
+  util.return %timepoint : !stream.timepoint
 }
 
 // -----
 
 // CHECK-LABEL: @parameterScatter
 // CHECK-SAME: (%[[WAIT:.+]]: !hal.fence, %[[SOURCE:.+]]: !hal.buffer) -> !hal.fence
-func.func @parameterScatter(%wait: !stream.timepoint, %source: !stream.resource<transient>) -> !stream.timepoint {
+util.func public @parameterScatter(%wait: !stream.timepoint, %source: !stream.resource<transient>) -> !stream.timepoint {
   %c50_i64 = arith.constant 50 : i64
   %c51_i64 = arith.constant 51 : i64
   %c52_i64 = arith.constant 52 : i64
@@ -173,5 +173,5 @@
     %source[%c102 for %c202] : !stream.resource<transient>{%c300} -> "scope"::"key2"[%c52_i64]
   } => !stream.timepoint
   // CHECK: return %[[SIGNAL]]
-  return %timepoint : !stream.timepoint
+  util.return %timepoint : !stream.timepoint
 }

diff --git a/compiler/src/iree/compiler/Modules/IO/Parameters/IR/test/parameter_ops.mlir b/compiler/src/iree/compiler/Modules/IO/Parameters/IR/test/parameter_ops.mlir
index e60c73e..bf3f808 100644
--- a/compiler/src/iree/compiler/Modules/IO/Parameters/IR/test/parameter_ops.mlir
+++ b/compiler/src/iree/compiler/Modules/IO/Parameters/IR/test/parameter_ops.mlir

@@ -2,7 +2,7 @@
 
 // CHECK-LABEL: @parameterLoad
 // CHECK-SAME: (%[[DEVICE:.+]]: !hal.device, %[[WAIT:.+]]: !hal.fence, %[[SIGNAL:.+]]: !hal.fence)
-func.func @parameterLoad(%device: !hal.device, %wait: !hal.fence, %signal: !hal.fence) {
+util.func public @parameterLoad(%device: !hal.device, %wait: !hal.fence, %signal: !hal.fence) {
   // CHECK-DAG: %[[AFFINITY:.+]] = arith.constant -1
   %affinity = arith.constant -1 : i64
   // CHECK-DAG: %[[OFFSET:.+]] = arith.constant 0
@@ -24,5 +24,5 @@
       usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage|SharingImmutable") {
         "scope"::"w0"[%offset] : !hal.buffer{%length}
       }
-  return
+  util.return
 }

diff --git a/compiler/src/iree/compiler/Modules/IO/Parameters/Transforms/Passes.td b/compiler/src/iree/compiler/Modules/IO/Parameters/Transforms/Passes.td
index 61b2247..36d374f 100644
--- a/compiler/src/iree/compiler/Modules/IO/Parameters/Transforms/Passes.td
+++ b/compiler/src/iree/compiler/Modules/IO/Parameters/Transforms/Passes.td

@@ -33,7 +33,6 @@
 def GenerateSplatParameterArchivePass :
     Pass<"iree-io-generate-splat-parameter-archive", "mlir::ModuleOp"> {
   let summary = "Generates a .irpa file with splat entries for all parameters";
-  let dependentDialects = [];
   let options = [
     Option<"archivePath", "archive-path", "std::string",
            /*default=*/"",

diff --git a/compiler/src/iree/compiler/Modules/IO/Parameters/Transforms/test/export_parameters.mlir b/compiler/src/iree/compiler/Modules/IO/Parameters/Transforms/test/export_parameters.mlir
index 1a474d0..5354708 100644
--- a/compiler/src/iree/compiler/Modules/IO/Parameters/Transforms/test/export_parameters.mlir
+++ b/compiler/src/iree/compiler/Modules/IO/Parameters/Transforms/test/export_parameters.mlir

@@ -9,7 +9,7 @@
   util.global private @array_global_0 = dense<[[11.0, 12.0]]> : tensor<1x2xf32>
   util.global private @dense_global_1 = dense<"0x0000E040000000410000104100002041"> : tensor<2x2xf32>
   util.global private @dense_global_2 = dense<"0x0000803F000000400000404000008040"> : tensor<2x2xf32>
-  func.func @parameter_example(%arg0: tensor<1x2xf32>) -> tensor<1x2xf32> {
+  util.func public @parameter_example(%arg0: tensor<1x2xf32>) -> tensor<1x2xf32> {
     %cst = arith.constant 0.000000e+00 : f32
     %3 = util.global.load @array_global_0 : tensor<1x2xf32>
     %4 = util.global.load @dense_global_1 : tensor<2x2xf32>
@@ -21,6 +21,6 @@
     %10 = linalg.add ins(%8, %5 : tensor<1x2xf32>, tensor<1x2xf32>) outs(%empty : tensor<1x2xf32>) -> tensor<1x2xf32>
     %12 = linalg.matmul ins(%10, %4 : tensor<1x2xf32>, tensor<2x2xf32>) outs(%fill : tensor<1x2xf32>) -> tensor<1x2xf32>
     %14 = linalg.add ins(%12, %3 : tensor<1x2xf32>, tensor<1x2xf32>) outs(%empty : tensor<1x2xf32>) -> tensor<1x2xf32>
-    return %14 : tensor<1x2xf32>
+    util.return %14 : tensor<1x2xf32>
   }
 }

diff --git a/compiler/src/iree/compiler/Modules/IO/Parameters/Transforms/test/generate_splat_parameter_archive.mlir b/compiler/src/iree/compiler/Modules/IO/Parameters/Transforms/test/generate_splat_parameter_archive.mlir
index c99d390..b77dfa8 100644
--- a/compiler/src/iree/compiler/Modules/IO/Parameters/Transforms/test/generate_splat_parameter_archive.mlir
+++ b/compiler/src/iree/compiler/Modules/IO/Parameters/Transforms/test/generate_splat_parameter_archive.mlir

@@ -11,7 +11,7 @@
   util.global private @dense_global_1 = #stream.parameter.named<"model"::"global_1"> : tensor<2x2xi32>
   util.global private @dense_global_2 = #stream.parameter.named<"model"::"global_2"> : tensor<1x2xi32>
   util.global private @dense_global_3 = #stream.parameter.named<"model"::"global_3"> : tensor<2x2xi32>
-  func.func @forward(%arg0: tensor<1x2xi32>) -> tensor<1x2xi32> {
+  util.func public @forward(%arg0: tensor<1x2xi32>) -> tensor<1x2xi32> {
     %cst = arith.constant 0 : i32
     %3 = util.global.load @array_global_0 : tensor<1x2xi32>
     %4 = util.global.load @dense_global_1 : tensor<2x2xi32>
@@ -23,7 +23,7 @@
     %10 = linalg.add ins(%8, %5 : tensor<1x2xi32>, tensor<1x2xi32>) outs(%empty : tensor<1x2xi32>) -> tensor<1x2xi32>
     %12 = linalg.matmul ins(%10, %4 : tensor<1x2xi32>, tensor<2x2xi32>) outs(%fill : tensor<1x2xi32>) -> tensor<1x2xi32>
     %14 = linalg.add ins(%12, %3 : tensor<1x2xi32>, tensor<1x2xi32>) outs(%empty : tensor<1x2xi32>) -> tensor<1x2xi32>
-    return %14 : tensor<1x2xi32>
+    util.return %14 : tensor<1x2xi32>
   }
 }
 

diff --git a/compiler/src/iree/compiler/Preprocessing/Common/test/external_function_spec.mlir b/compiler/src/iree/compiler/Preprocessing/Common/test/external_function_spec.mlir
index 579ea29..5152aca 100644
--- a/compiler/src/iree/compiler/Preprocessing/Common/test/external_function_spec.mlir
+++ b/compiler/src/iree/compiler/Preprocessing/Common/test/external_function_spec.mlir

@@ -1,18 +1,18 @@
 // Test for importing functions from this spec to a payload module.
 // Tested in `transform_symbol_importing.mlir`
 module attributes {transform.with_named_sequence} {
-  func.func private @some_external_function(%arg0: tensor<?xf32>) -> tensor<?xf32>
+  util.func private @some_external_function(%arg0: tensor<?xf32>) -> tensor<?xf32>
 
-  func.func @some_function(%arg0: tensor<?xf32>) -> tensor<?xf32> {
-    return %arg0 : tensor<?xf32>
+  util.func @some_function(%arg0: tensor<?xf32>) -> tensor<?xf32> {
+    util.return %arg0 : tensor<?xf32>
   }
 
   transform.named_sequence @__transform_main(%module: !transform.any_op) {
-    %new_func = transform.iree.import_symbol @some_function into %module : (!transform.any_op) -> !transform.any_op
+    %new_func = transform.util.import_symbol @some_function into %module : (!transform.any_op) -> !transform.any_op
 
-    %func = transform.structured.match ops{["func.func"]} in %module : (!transform.any_op) -> !transform.any_op   
-    %module_2 = transform.iree.get_nearest_symbol_table %func : (!transform.any_op) -> !transform.any_op
-    %new_func_2 = transform.iree.import_symbol @some_external_function into %module_2 : (!transform.any_op) -> !transform.any_op
+    %func = transform.structured.match ops{["util.func"]} in %module : (!transform.any_op) -> !transform.any_op
+    %module_2 = transform.util.get_nearest_symbol_table %func : (!transform.any_op) -> !transform.any_op
+    %new_func_2 = transform.util.import_symbol @some_external_function into %module_2 : (!transform.any_op) -> !transform.any_op
     transform.yield
   }
 }

diff --git a/compiler/src/iree/compiler/Preprocessing/Common/test/transform_symbol_importing.mlir b/compiler/src/iree/compiler/Preprocessing/Common/test/transform_symbol_importing.mlir
index 77a6f5b..d77a4e3 100644
--- a/compiler/src/iree/compiler/Preprocessing/Common/test/transform_symbol_importing.mlir
+++ b/compiler/src/iree/compiler/Preprocessing/Common/test/transform_symbol_importing.mlir

@@ -5,6 +5,6 @@
 }
 
 // CHECK-LABEL: module @example
-//       CHECK:   func.func private @some_external_function(tensor<?xf32>) -> tensor<?xf32>
-//       CHECK:   func.func @some_function(%arg0: tensor<?xf32>) -> tensor<?xf32>
-//  CHECK-NEXT:     return %arg0 : tensor<?xf32>
+//       CHECK:   util.func private @some_external_function(%arg0: tensor<?xf32>) -> tensor<?xf32>
+//       CHECK:   util.func public @some_function(%arg0: tensor<?xf32>) -> tensor<?xf32>
+//  CHECK-NEXT:     util.return %arg0 : tensor<?xf32>

diff --git a/compiler/src/iree/compiler/Preprocessing/TransformExtensions/PreprocessingExtensions.cpp b/compiler/src/iree/compiler/Preprocessing/TransformExtensions/PreprocessingExtensions.cpp
index 71b03e9..23087ef 100644
--- a/compiler/src/iree/compiler/Preprocessing/TransformExtensions/PreprocessingExtensions.cpp
+++ b/compiler/src/iree/compiler/Preprocessing/TransformExtensions/PreprocessingExtensions.cpp

@@ -30,98 +30,6 @@
 }
 
 //===----------------------------------------------------------------------===//
-// GetNearestSymbolTableOp
-//===----------------------------------------------------------------------===//
-
-DiagnosedSilenceableFailure
-IREE::transform_dialect::GetNearestSymbolTableOp::applyToOne(
-    transform::TransformRewriter &rewriter, Operation *target,
-    transform::ApplyToEachResultList &results,
-    transform::TransformState &state) {
-  auto tableOp = SymbolTable::getNearestSymbolTable(target);
-  if (!tableOp) {
-    return emitDefaultDefiniteFailure(target);
-  }
-  results.push_back(tableOp);
-  return DiagnosedSilenceableFailure::success();
-}
-
-void IREE::transform_dialect::GetNearestSymbolTableOp::getEffects(
-    SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
-  transform::onlyReadsHandle(getTarget(), effects);
-  transform::producesHandle(getResult(), effects);
-  transform::modifiesPayload(effects);
-}
-
-//===----------------------------------------------------------------------===//
-// ImportSymbolOp
-//===----------------------------------------------------------------------===//
-
-DiagnosedSilenceableFailure IREE::transform_dialect::ImportSymbolOp::apply(
-    transform::TransformRewriter &rewriter,
-    transform::TransformResults &transformResults,
-    transform::TransformState &state) {
-  auto symbolOp = SymbolTable::lookupNearestSymbolFrom(*this, getSymbol());
-  if (!symbolOp) {
-    return emitDefiniteFailure() << "could not find corresponding symbol op";
-  }
-  // Require isolated from above as the clone does not make sense with escaping
-  // values.
-  if (!symbolOp->hasTrait<OpTrait::IsIsolatedFromAbove>()) {
-    return emitDefiniteFailure()
-           << "target symbol op is not isolated from above";
-  }
-  StringRef symbol = getSymbol().getLeafReference();
-  SmallVector<Operation *> results;
-  for (Operation *payloadOp : state.getPayloadOps(getSymbolTable())) {
-    if (!payloadOp->hasTrait<OpTrait::SymbolTable>()) {
-      return emitDefiniteFailure()
-             << "target symbol table " << payloadOp << " is not a symbol table";
-    }
-    SymbolTable symbolTable(payloadOp);
-
-    if (Operation *preExistingSymbolOp = symbolTable.lookup(symbol)) {
-      if (getForceImport()) {
-        // If we want to overwrite pre-existing symbols, just erase it here.
-        symbolTable.erase(preExistingSymbolOp);
-      } else if (getIfUndefined()) {
-        // Skip if we want to use the symbol that is already there.
-        results.push_back(preExistingSymbolOp);
-        continue;
-      } else {
-        return emitDefiniteFailure()
-               << "target symbol " << symbol << " is already defined";
-      }
-    }
-
-    // Symbol table ops must have exactly one region with exactly one block.
-    // Simply clone the target symbol op into the single block.
-    rewriter.setInsertionPointToStart(&payloadOp->getRegion(0).front());
-    results.push_back(rewriter.clone(*symbolOp));
-  }
-  transformResults.set(cast<OpResult>(getClonedSymbol()), results);
-  return DiagnosedSilenceableFailure::success();
-}
-
-void IREE::transform_dialect::ImportSymbolOp::getEffects(
-    SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
-  transform::onlyReadsHandle(getSymbolTable(), effects);
-  transform::producesHandle(getClonedSymbol(), effects);
-  transform::modifiesPayload(effects);
-}
-
-LogicalResult IREE::transform_dialect::ImportSymbolOp::verify() {
-  if (getForceImport() && getIfUndefined()) {
-    return emitOpError()
-           << "force_import and if_undefined are mutually exclusive";
-  }
-  if (!SymbolTable::lookupNearestSymbolFrom(*this, getSymbol())) {
-    return emitOpError() << "invalid import of undefined symbol";
-  }
-  return success();
-}
-
-//===----------------------------------------------------------------------===//
 // MatchCastCompatibleDagFromRootOp
 //===----------------------------------------------------------------------===//
 

diff --git a/compiler/src/iree/compiler/Preprocessing/TransformExtensions/PreprocessingExtensionsOps.td b/compiler/src/iree/compiler/Preprocessing/TransformExtensions/PreprocessingExtensionsOps.td
index de0232f..af7eba0 100644
--- a/compiler/src/iree/compiler/Preprocessing/TransformExtensions/PreprocessingExtensionsOps.td
+++ b/compiler/src/iree/compiler/Preprocessing/TransformExtensions/PreprocessingExtensionsOps.td

@@ -15,74 +15,6 @@
 include "mlir/Interfaces/SideEffectInterfaces.td"
 include "mlir/IR/OpBase.td"
 
-def GetNearestSymbolTableOp : Op<Transform_Dialect, "iree.get_nearest_symbol_table",
-    [FunctionalStyleTransformOpTrait,
-     DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
-     TransformOpInterface,
-     TransformEachOpTrait,
-     ReportTrackingListenerFailuresOpTrait]> {
-  let description = [{
-    Returns the nearest symbol table op for each op in the payload, inclusive.
-
-    #### Return modes
-
-    This operation reads the `target` handle and produces the `result`
-    handle. This operation emits a definite failure if the nearest symbol table
-    is unknown.
-  }];
-
-  let arguments = (ins TransformHandleTypeInterface:$target);
-  let results = (outs TransformHandleTypeInterface:$result);
-
-  let assemblyFormat = "$target attr-dict `:` functional-type(operands, results)";
-  let cppNamespace = "mlir::iree_compiler::IREE::transform_dialect";
-  let extraClassDeclaration = [{
-    ::mlir::DiagnosedSilenceableFailure applyToOne(
-        ::mlir::transform::TransformRewriter &rewriter,
-        ::mlir::Operation* target,
-        ::mlir::transform::ApplyToEachResultList &results,
-        ::mlir::transform::TransformState &state);
-  }];
-}
-
-def ImportSymbolOp : Op<Transform_Dialect, "iree.import_symbol",
-    [FunctionalStyleTransformOpTrait,
-     DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
-     DeclareOpInterfaceMethods<TransformOpInterface>,
-     ReportTrackingListenerFailuresOpTrait]> {
-  let description = [{
-    Clones the op defined by the given symbol into the given symbol table and
-    returns the cloned symbol. If `force_import` is set, this will (unsafely)
-    overwrite any pre-existing definitions of the same symbol. If
-    `if_undefined` is set, this will return a handle to the pre-existing symbol
-    in the payload if found instead of failing.
-
-    #### Return modes
-
-    This operation reads the `symbol_table` handle and produces the
-    `cloned_symbol` handle. This operation emits a definite failure if the if
-    the `symbol_table` op does not define a symbol table.
-
-    This will emit a definite failure if the symbol already exists in the
-    symbol table and neither `force_import` and `if_undefined` are set.
-  }];
-
-  let arguments = (ins SymbolRefAttr:$symbol,
-                       UnitAttr:$if_undefined,
-                       UnitAttr:$force_import,
-                       TransformHandleTypeInterface:$symbol_table);
-  let results = (outs TransformHandleTypeInterface:$cloned_symbol);
-
-  let assemblyFormat = [{
-    (`force` $force_import^)? $symbol `into` $symbol_table
-    (`if` `undefined` $if_undefined^)? attr-dict 
-    `:` functional-type(operands, results)
-  }];
-  let cppNamespace = "mlir::iree_compiler::IREE::transform_dialect";
-
-  let hasVerifier = 1;
-}
-
 def MatchCastCompatibleDagFromRootOp : Op<Transform_Dialect, "iree.match.cast_compatible_dag_from_root",
     [IsolatedFromAbove,
      MatchOpInterface,

diff --git a/samples/custom_dispatch/cpu/embedded/example_transform_spec.mlir b/samples/custom_dispatch/cpu/embedded/example_transform_spec.mlir
index 1867e5e..c709e20 100644
--- a/samples/custom_dispatch/cpu/embedded/example_transform_spec.mlir
+++ b/samples/custom_dispatch/cpu/embedded/example_transform_spec.mlir

@@ -68,7 +68,7 @@
     }  // hal.executable.variant
   }  // hal.executable
 
-  func.func @call_mul_abs_negate(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
+  util.func private @call_mul_abs_negate(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>) -> tensor<?xf32> {
     %c0 = arith.constant 0 : index
     %dim = tensor.dim %arg0, %c0 : tensor<?xf32>
     %dim_i32 = arith.index_cast %dim : index to i32
@@ -79,12 +79,10 @@
         #hal.interface.binding<0, 0>,
         #hal.interface.binding<0, 1>,
         #hal.interface.binding<0, 2>
-      ],
-      // HACK: keep the executable live through DCE. Only required when
-      // using the automatic variant selection.
-      hal.executable.ref = [@executable]
+      ]
     } : (i32, tensor<?xf32>{%dim}, tensor<?xf32>{%dim}) -> tensor<?xf32>{%dim}
-    return %0 : tensor<?xf32>
+
+    util.return %0 : tensor<?xf32>
   }
 
   transform.named_sequence @match_mul_abs_negate(%root: !transform.any_op {transform.readonly}) -> (!transform.any_value, !transform.any_value) {
@@ -137,10 +135,10 @@
   transform.named_sequence @cast_and_call_dag(%ins: !transform.any_value {transform.readonly},
                                               %out: !transform.any_value {transform.readonly}) {
     %root = transform.get_defining_op %out : (!transform.any_value) -> !transform.any_op
-    %module = transform.iree.get_nearest_symbol_table %root : (!transform.any_op) -> !transform.any_op
-    %executable = transform.iree.import_symbol @executable into %module if undefined : (!transform.any_op) -> !transform.any_op
-    %func = transform.iree.import_symbol @call_mul_abs_negate into %module if undefined : (!transform.any_op) -> !transform.any_op
-    transform.func.cast_and_call %func(%ins) -> %out after %root {
+    %module = transform.util.get_nearest_symbol_table %root : (!transform.any_op) -> !transform.any_op
+    %executable = transform.util.import_symbol @executable into %module if undefined : (!transform.any_op) -> !transform.any_op
+    %func = transform.util.import_symbol @call_mul_abs_negate into %module if undefined : (!transform.any_op) -> !transform.any_op
+    transform.util.cast_and_call %func(%ins) -> %out after %root {
           // This specifies how to resolve type mismatches between the arguments
           // of the function and the inputs from the matcher. In this example,
           // the only casts this will generate are same-rank tensor casts that
@@ -155,7 +153,7 @@
   // add a new symbol to the module's symbol table.
   transform.named_sequence @__transform_main(%module: !transform.any_op) {
     // Gather the set of functions within the module.
-    %funcs = transform.structured.match ops{["func.func"]} in %module : (!transform.any_op) -> !transform.any_op   
+    %funcs = transform.structured.match ops{["util.func"]} in %module : (!transform.any_op) -> !transform.any_op
     // For each function in the module, run the matcher on all contained
     // operations.
     transform.foreach %funcs : !transform.any_op {

diff --git a/samples/custom_dispatch/cpu/mlp_plugin/mlp_spec.mlir b/samples/custom_dispatch/cpu/mlp_plugin/mlp_spec.mlir
index 72a9860..eec83f7 100644
--- a/samples/custom_dispatch/cpu/mlp_plugin/mlp_spec.mlir
+++ b/samples/custom_dispatch/cpu/mlp_plugin/mlp_spec.mlir

@@ -1,4 +1,4 @@
-// Sample spec that matches an MLP example and forwards to 
+// Sample spec that matches an MLP example and forwards to
 // an implementation implemented by a system plugin.
 // Is used along with samples/custom_dispatch/cpu/plugin/mlp.mlir
 
@@ -51,7 +51,7 @@
     }
   }
 
-  func.func private @call_mlp(%lhs : tensor<?x?xf32>, %rhs : tensor<?x?xf32>, %init1 : tensor<?x?xf32>, %init2 : tensor<?x?xf32>) -> tensor<?x?xf32> {
+  util.func private @call_mlp(%lhs : tensor<?x?xf32>, %rhs : tensor<?x?xf32>, %init1 : tensor<?x?xf32>, %init2 : tensor<?x?xf32>) -> tensor<?x?xf32> {
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
     %m = tensor.dim %lhs, %c0 : tensor<?x?xf32>
@@ -61,17 +61,15 @@
     %n_i32 = arith.index_cast %n : index to i32
     %k_i32 = arith.index_cast %k : index to i32
 
-    %mlp_result = flow.dispatch @executable::@x86_64::@mlp[](%lhs, %rhs, %m_i32, %n_i32, %k_i32) {
+    %mlp_result = flow.dispatch @executable::@x86_64::@mlp(%lhs, %rhs, %m_i32, %n_i32, %k_i32) {
       hal.interface.bindings = [
         #hal.interface.binding<0, 0>,
         #hal.interface.binding<0, 1>,
         #hal.interface.binding<0, 2>
-      ],
-      // HACK: keep the executable live through DCE. Only required when
-      // using the automatic variant selection.
-      hal.executable.ref = [@executable]
-    } : (tensor<?x?xf32>{%m, %k}, tensor<?x?xf32>{%k, %n}, i32, i32, i32) -> tensor<?x?xf32>{%m, %n}  
-    return %mlp_result : tensor<?x?xf32>    
+      ]
+    } : (tensor<?x?xf32>{%m, %k}, tensor<?x?xf32>{%k, %n}, i32, i32, i32) -> tensor<?x?xf32>{%m, %n}
+
+    util.return %mlp_result : tensor<?x?xf32>
   }
 
   transform.named_sequence @match_mlp(%root: !transform.any_op {transform.readonly}) -> (!transform.any_value, !transform.any_value) {
@@ -104,10 +102,10 @@
   transform.named_sequence @cast_and_call_dag(%ins: !transform.any_value {transform.readonly},
                                               %out: !transform.any_value {transform.readonly}) {
     %root = transform.get_defining_op %out : (!transform.any_value) -> !transform.any_op
-    %module = transform.iree.get_nearest_symbol_table %root : (!transform.any_op) -> !transform.any_op
-    %executable = transform.iree.import_symbol @executable into %module if undefined : (!transform.any_op) -> !transform.any_op
-    %func = transform.iree.import_symbol @call_mlp into %module if undefined : (!transform.any_op) -> !transform.any_op
-    transform.func.cast_and_call %func(%ins) -> %out after %root {
+    %module = transform.util.get_nearest_symbol_table %root : (!transform.any_op) -> !transform.any_op
+    %executable = transform.util.import_symbol @executable into %module if undefined : (!transform.any_op) -> !transform.any_op
+    %func = transform.util.import_symbol @call_mlp into %module if undefined : (!transform.any_op) -> !transform.any_op
+    transform.util.cast_and_call %func(%ins) -> %out after %root {
       // This specifies how to resolve type mismatches between the arguments
       // of the function and the inputs from the matcher. In this example,
       // the only casts this will generate are same-rank tensor casts that
@@ -122,7 +120,7 @@
   // add a new symbol to the module's symbol table.
   transform.named_sequence @__transform_main(%module: !transform.any_op) {
     // Gather the set of functions within the module.
-    %funcs = transform.structured.match ops{["func.func"]} in %module : (!transform.any_op) -> !transform.any_op   
+    %funcs = transform.structured.match ops{["util.func"]} in %module : (!transform.any_op) -> !transform.any_op
     // For each function in the module, run the matcher on all contained
     // operations.
     transform.foreach %funcs : !transform.any_op {

diff --git a/samples/custom_dispatch/vulkan/shaders/example_transform_spec.mlir b/samples/custom_dispatch/vulkan/shaders/example_transform_spec.mlir
index 065b795..08b977d 100644
--- a/samples/custom_dispatch/vulkan/shaders/example_transform_spec.mlir
+++ b/samples/custom_dispatch/vulkan/shaders/example_transform_spec.mlir

@@ -32,7 +32,7 @@
       count(%device: !hal.device, %workload: index) -> (index, index, index) {
         %c1_0 = arith.constant 1 : index
         hal.return %c1_0, %c1_0, %c1_0 : index, index, index
-      }   
+      }
       layout(#hal.pipeline.layout<push_constants = 1, sets = [
         <0, bindings = [
             <0, storage_buffer, ReadOnly>,
@@ -40,11 +40,11 @@
         ]>
       ]>)
       bindings([
-        #hal.interface.binding<0, 0>, 
+        #hal.interface.binding<0, 0>,
         #hal.interface.binding<0, 1>
-      ])  
+      ])
       objects({
-        #spirv_target ordinal(0) = [ 
+        #spirv_target ordinal(0) = [
           #hal.executable.object<{
             path = "samples/custom_dispatch/vulkan/shaders/one_workgroup_argmax_subgroup_f32.spv"
           }>
@@ -78,8 +78,8 @@
       transform.match.param.cmpi eq %n_inputs, %c1 : !transform.param<i64>
       %n_outputs = transform.match.structured.num_inits %argmax : (!transform.any_op) -> !transform.param<i64>
       transform.match.param.cmpi eq %n_outputs, %c2 : !transform.param<i64>
-  
-      transform.match.structured.yield %argmax : !transform.any_op 
+
+      transform.match.structured.yield %argmax : !transform.any_op
     }
 
     // Verify the operand shapes of the linalg op. For example, in the below,
@@ -125,8 +125,8 @@
   // custom kernel authored above, and replace the users of the argmax with a
   // call to the function.
   transform.named_sequence @cast_and_call_argmax(%argmax: !transform.any_op {transform.readonly}) {
-    %module = transform.iree.get_nearest_symbol_table %argmax : (!transform.any_op) -> !transform.any_op
-    %func = transform.iree.import_symbol @argmax_1d_f32_entry_point into %module : (!transform.any_op) -> !transform.any_op
+    %module = transform.util.get_nearest_symbol_table %argmax : (!transform.any_op) -> !transform.any_op
+    %func = transform.util.import_symbol @argmax_1d_f32_entry_point into %module : (!transform.any_op) -> !transform.any_op
     %ins = transform.get_operand %argmax[0] : (!transform.any_op) -> !transform.any_value
     %outs = transform.get_result %argmax[1] : (!transform.any_op) -> !transform.any_value
     transform.func.cast_and_call %func(%ins) -> %outs before %argmax {
@@ -144,7 +144,7 @@
   // add a new symbol to the module's symbol table.
   transform.named_sequence @__transform_main(%module: !transform.any_op) {
     // Gather the set of functions within the module.
-    %funcs = transform.structured.match ops{["func.func"]} in %module : (!transform.any_op) -> !transform.any_op   
+    %funcs = transform.structured.match ops{["func.func"]} in %module : (!transform.any_op) -> !transform.any_op
     // For each function in the module, run the matcher on all contained
     // operations.
     transform.foreach %funcs : !transform.any_op {

diff --git a/tests/compiler_driver/inline_dynamic_hal_executable.mlir b/tests/compiler_driver/inline_dynamic_hal_executable.mlir
index 0fcf0e9..9c50d03 100644
--- a/tests/compiler_driver/inline_dynamic_hal_executable.mlir
+++ b/tests/compiler_driver/inline_dynamic_hal_executable.mlir

@@ -1,6 +1,5 @@
 // RUN: iree-compile \
 // RUN:   --compile-to=hal \
-// RUN:   --mlir-print-ir-after-all \
 // RUN:   --iree-execution-model=inline-dynamic \
 // RUN:   --iree-hal-target-backends=vmvx %s | FileCheck %s
 
@@ -9,7 +8,8 @@
   return %0, %arg0 : tensor<4xf32>, tensor<4xf32>
 }
 
-// Check the IR not registered as iree_hal_module_register_loader_types
+// Check that the IR isn't using types from the full HAL
+// (only those in iree_hal_module_register_loader_types).
 // CHECK-NOT: hal.command_buffer
 // CHECK-NOT: hal.allocator
 // CHECK-NOT: hal.event

diff --git a/tests/compiler_driver/inline_static_hal_executable.mlir b/tests/compiler_driver/inline_static_hal_executable.mlir
index 55f0c59..2ac89e9 100644
--- a/tests/compiler_driver/inline_static_hal_executable.mlir
+++ b/tests/compiler_driver/inline_static_hal_executable.mlir

@@ -1,6 +1,5 @@
 // RUN: iree-compile \
 // RUN:   --compile-to=hal \
-// RUN:   --mlir-print-ir-after-all \
 // RUN:   --iree-execution-model=inline-static \
 // RUN:   --iree-hal-target-backends=vmvx-inline %s | FileCheck %s
 
@@ -9,7 +8,8 @@
   return %0, %arg0 : tensor<4xf32>, tensor<4xf32>
 }
 
-// Check the IR not registered as iree_hal_module_register_inline_types
+// Check that the IR isn't using types from the full HAL
+// (only those in iree_hal_module_register_inline_types).
 // CHECK-NOT: hal.command_buffer
 // CHECK-NOT: hal.allocator
 // CHECK-NOT: hal.event

diff --git a/tests/compiler_driver/preprocessing_flags.mlir b/tests/compiler_driver/preprocessing_flags.mlir
index 8cd049f..6c4ad99 100644
--- a/tests/compiler_driver/preprocessing_flags.mlir
+++ b/tests/compiler_driver/preprocessing_flags.mlir

@@ -1,5 +1,5 @@
 // RUN: iree-compile --iree-hal-target-backends=llvm-cpu --compile-to=preprocessing \
-// RUN:   --iree-preprocessing-pass-pipeline="builtin.module(func.func(iree-preprocessing-convert-conv2d-to-img2col,iree-preprocessing-pad-linalg-ops{pad-size=16}))" \
+// RUN:   --iree-preprocessing-pass-pipeline="builtin.module(util.func(iree-preprocessing-convert-conv2d-to-img2col,iree-preprocessing-pad-linalg-ops{pad-size=16}))" \
 // RUN:   --mlir-print-ir-after=iree-preprocessing-convert-conv2d-to-img2col --mlir-print-ir-after=iree-preprocessing-pad-linalg-ops %s 2>&1 \
 // RUN:   | FileCheck %s
 
@@ -8,11 +8,12 @@
       outs(%arg2 : tensor<10x30xf32>) -> tensor<10x30xf32>
   return %0 : tensor<10x30xf32>
 }
+
 // Just check that the pass runs, and that the compilation finishes
 //       CHECK: ConvertConv2DToImg2Col (iree-preprocessing-convert-conv2d-to-img2col)
 //       CHECK: PadLinalgOps (iree-preprocessing-pad-linalg-ops)
 // CHECK-LABEL: module
-//  CHECK-NEXT:   func.func @test(
+//  CHECK-NEXT:   util.func public @test(
 //   CHECK-DAG:     %[[ARG0:.+]] = hal.tensor.import %{{[a-zA-Z0-9]+}} "input0" : !hal.buffer_view -> tensor<10x20xf32>
 //   CHECK-DAG:     %[[ARG1:.+]] = hal.tensor.import %{{[a-zA-Z0-9]+}} "input1" : !hal.buffer_view -> tensor<20x30xf32>
 //   CHECK-DAG:     %[[ARG2:.+]] = hal.tensor.import %{{[a-zA-Z0-9]+}} "input2" : !hal.buffer_view -> tensor<10x30xf32>

diff --git a/tests/transform_dialect/cpu/BUILD.bazel b/tests/transform_dialect/cpu/BUILD.bazel
index 5249a7c..aabea9d 100644
--- a/tests/transform_dialect/cpu/BUILD.bazel
+++ b/tests/transform_dialect/cpu/BUILD.bazel

@@ -17,9 +17,11 @@
         "attention.mlir",
         "contraction-packing.mlir",
         "contraction-packing-and-dispatch.mlir",
-        "eltwise_reduction_eltwise.mlir",
+        # DISABLED: incorrectly assuming default flag values.
+        # "eltwise_reduction_eltwise.mlir",
         "fold_tensor_slice_into_transfer.mlir",
-        "matmul.mlir",
+        # DISABLED: incorrectly assuming default flag values.
+        # "matmul.mlir",
         "matmul_library_call.mlir",
     ],
     cfg = "//tests:lit.cfg.py",

diff --git a/tests/transform_dialect/cpu/CMakeLists.txt b/tests/transform_dialect/cpu/CMakeLists.txt
index 7ddd39f..4328bf4 100644
--- a/tests/transform_dialect/cpu/CMakeLists.txt
+++ b/tests/transform_dialect/cpu/CMakeLists.txt

@@ -17,9 +17,7 @@
     "attention.mlir"
     "contraction-packing-and-dispatch.mlir"
     "contraction-packing.mlir"
-    "eltwise_reduction_eltwise.mlir"
     "fold_tensor_slice_into_transfer.mlir"
-    "matmul.mlir"
     "matmul_library_call.mlir"
   TOOLS
     ${IREE_LLD_TARGET}

diff --git a/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir b/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir
index 622db6a..bb0c56a 100644
--- a/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir
+++ b/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir

@@ -2,11 +2,9 @@
 // Preprocessing with generalized packing.
 //
 // RUN: iree-opt %s --iree-transform-dialect-interpreter --transform-dialect-drop-schedule | \
-// RUN: iree-opt --iree-hal-target-backends=llvm-cpu \
-// RUN:     --iree-abi-transformation-pipeline \
-// RUN:     --iree-flow-transformation-pipeline \
-// RUN:     --iree-stream-transformation-pipeline \
-// RUN:     --iree-hal-configuration-pipeline | \
+// RUN: iree-compile - --iree-hal-target-backends=llvm-cpu \
+// RUN:     --iree-opt-data-tiling=false \
+// RUN:     --compile-to=executable-configurations | \
 // RUN: FileCheck %s
 
 !a_tensor_t = tensor<1234x567xf32>
@@ -16,10 +14,6 @@
 // Note: the normalization in these maps is gone due to InterchangeGenericOps.
 // When using generalized packing, it would be better to drop that pass.
 
-// CHECK-DAG: #[[$map_lhs:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d4, d2, d5)>
-// CHECK-DAG: #[[$map_rhs:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d4, d1, d3, d5)>
-// CHECK-DAG: #[[$map_res:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>
-
 // CHECK-LABEL: func.func @matmul_dispatch_0
 //       CHECK:   tensor.empty() : tensor<155x18x8x32xf32>
 //       CHECK:   tensor.pack
@@ -33,11 +27,12 @@
 //       CHECK:   tensor.pack
 
 // CHECK-LABEL: func.func @matmul_dispatch_3
-func.func @matmul(%arg0: !a_tensor_t, %arg2: !c_tensor_t) -> !c_tensor_t {
+func.func public @matmul(%arg0: !a_tensor_t, %arg2: !c_tensor_t) -> !c_tensor_t {
   %rhs = arith.constant dense<0.1> : !b_tensor_t
   %c0 = util.optimization_barrier %rhs : !b_tensor_t
   //  CHECK-NOT: pack
-  //      CHECK: linalg.generic {indexing_maps = [#[[$map_lhs]], #[[$map_rhs]], #[[$map_res]]],
+  //      CHECK: linalg.generic
+  // CHECK-SAME:   indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d4, d2, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d4, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>]
   // CHECK-SAME:   iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction"]}
   // CHECK-SAME:   ins(%{{.*}} : tensor<155x18x8x32xf32>, tensor<18x56x16x32xf32>)
   // CHECK-SAME:  outs(%{{.*}} : tensor<155x56x8x16xf32>)

diff --git a/tests/transform_dialect/cpu/eltwise_reduction_eltwise.mlir b/tests/transform_dialect/cpu/eltwise_reduction_eltwise.mlir
index 60a1738..0e61ca7 100644
--- a/tests/transform_dialect/cpu/eltwise_reduction_eltwise.mlir
+++ b/tests/transform_dialect/cpu/eltwise_reduction_eltwise.mlir

@@ -43,19 +43,17 @@
   return %8 : !out_tensor_t
 }
 
-
-// RUN: iree-opt %s --iree-hal-target-backends=llvm-cpu \
-// RUN:     --iree-abi-transformation-pipeline \
-// RUN:     --iree-flow-transformation-pipeline \
-// RUN:     --iree-stream-transformation-pipeline \
-// RUN:     --iree-hal-configuration-pipeline | \
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmcpu-select-lowering-strategy, iree-llvmcpu-lower-executable-target)))' \
+// RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu \
+// RUN:     --iree-opt-data-tiling=false \
+// RUN:     --compile-to=executable-configurations | \
+// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs,iree-llvmcpu-select-lowering-strategy,iree-llvmcpu-lower-executable-target)))' \
 // RUN:     --iree-llvmcpu-enable-transform-dialect-jit | \
 // RUN: FileCheck %s
 
 // RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu  \
+// RUN:     --iree-opt-data-tiling=false \
 // RUN:     --iree-llvmcpu-enable-transform-dialect-jit | \
-// RUN: iree-run-module --module=- --function=reduce --device=local-task --input="32x256xf32=1" |\
+// RUN: iree-run-module --module=- --function=reduce --device=local-task --input="32x256xf32=1" | \
 // RUN: FileCheck %s --check-prefix=EXEC
 
 //      CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index

diff --git a/tests/transform_dialect/cpu/matmul.mlir b/tests/transform_dialect/cpu/matmul.mlir
index 1af272f..246b712 100644
--- a/tests/transform_dialect/cpu/matmul.mlir
+++ b/tests/transform_dialect/cpu/matmul.mlir

@@ -1,4 +1,3 @@
-
 !A_size = tensor<3x5xf32>
 !B_size = tensor<5x3xf32>
 !C_size = tensor<3x3xf32>
@@ -10,19 +9,17 @@
   return %0 : !C_size
 }
 
-// RUN: iree-opt %s --iree-hal-target-backends=llvm-cpu \
-// RUN:   --iree-abi-transformation-pipeline \
-// RUN:   --iree-flow-transformation-pipeline \
-// RUN:   --iree-stream-transformation-pipeline \
-// RUN:   --iree-hal-configuration-pipeline | \
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmcpu-lower-executable-target)))' \
+// RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu \
+// RUN:   --iree-opt-data-tiling=false \
+// RUN:   --compile-to=executable-configurations | \
+// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs,iree-llvmcpu-lower-executable-target)))' \
 // RUN:   --iree-codegen-transform-dialect-library=%p/matmul_codegen_default_spec.mlir \
 // RUN:   --iree-codegen-use-transform-dialect-strategy=codegen | \
 // RUN: FileCheck %s --check-prefixes=CODEGEN-DEFAULT
 
 // CODEGEN-DEFAULT:     hal.executable.export public @matmul_static_dispatch_0_matmul_3x3x5
-// CODEGEN-DEFAULT:         %[[C2:.+]] = arith.constant 2 : index
-// CODEGEN-DEFAULT:         %[[C1:.+]] = arith.constant 1 : index
+// CODEGEN-DEFAULT-DAG:     %[[C1:.+]] = arith.constant 1 : index
+// CODEGEN-DEFAULT-DAG:     %[[C2:.+]] = arith.constant 2 : index
 // CODEGEN-DEFAULT:         hal.return %[[C2]], %[[C1]], %[[C1]]
 
 // RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu \

diff --git a/tools/test/compile_to_phase.mlir b/tools/test/compile_to_phase.mlir
index 0e2b853..0390564 100644
--- a/tools/test/compile_to_phase.mlir
+++ b/tools/test/compile_to_phase.mlir

@@ -1,9 +1,9 @@
 // RUN: iree-compile --compile-to=input %s | FileCheck %s --check-prefix=INPUT-PHASE
-// INPUT-PHASE: func.func @abs(%[[ARG0:.+]]: tensor<f32>)
+// INPUT-PHASE: util.func public @abs(%[[ARG0:.+]]: tensor<f32>)
 // INPUT-PHASE: math.absf %[[ARG0]] : tensor<f32>
 
 // RUN: iree-compile --compile-to=abi %s | FileCheck %s --check-prefix=ABI-PHASE
-// ABI-PHASE: func.func @abs(%[[ARG0:.+]]: !hal.buffer_view)
+// ABI-PHASE: util.func public @abs(%[[ARG0:.+]]: !hal.buffer_view)
 // ABI-PHASE: %[[INPUT:.+]] = hal.tensor.import %[[ARG0]] "input0" : !hal.buffer_view -> tensor<f32>
 // ABI-PHASE: math.absf %[[INPUT]] : tensor<f32>
commit	045bca1e213ea9ab4210277776430b497b64be10	[log] [tgz]
author	Ben Vanik <ben.vanik@gmail.com>	Thu Feb 15 15:53:12 2024 -0800
committer	GitHub <noreply@github.com>	Thu Feb 15 23:53:12 2024 +0000
tree	daa0baaa2d024f85cbb9fb1f9aed0246388d968f
parent	1ee6007a2c07b1b251bc4393e6778b13a278d33c [diff]