Removing -iree-flow-experimental-dispatch-reduce and the reduction region ops.

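This deletes the flow.reduction.region / flow.windowed_reduction.region ops,
the flow.reduction.entry / flow.windowed_reduction.entry entry points, and the
IdentifyReductionRegions / OutlineReductionRegions passes along with their
tests. The previously experimental path is now the only one: the pipeline
unrolls multi-dimensional reductions to one reduction per dimension and treats
them as ordinary dispatch regions.

After this change the relevant section of buildFlowTransformPassPipeline in
Passes.cpp reduces to (see the Passes.cpp hunk below):

  // Unroll multi-dimensional reductions to one reduction per dimension.
  passManager.addNestedPass<FuncOp>(IREE::Flow::createUnrollReductionsPass());
  passManager.addNestedPass<FuncOp>(createCSEPass());
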
PiperOrigin-RevId: 300161044
diff --git a/iree/compiler/Dialect/Flow/IR/FlowOps.cpp b/iree/compiler/Dialect/Flow/IR/FlowOps.cpp
index 7e02912..dcebd2c 100644
--- a/iree/compiler/Dialect/Flow/IR/FlowOps.cpp
+++ b/iree/compiler/Dialect/Flow/IR/FlowOps.cpp
@@ -354,277 +354,6 @@
 }
 
 //===----------------------------------------------------------------------===//
-// flow.reduction.region
-//===----------------------------------------------------------------------===//
-
-void ReductionRegionOp::build(Builder *builder, OperationState &state,
-                              ArrayRef<Type> resultTypes, Value workload,
-                              ValueRange operands, ValueRange initialValues,
-                              ArrayRef<int32_t> dimensions,
-                              ArrayRef<NamedAttribute> attributes) {
-  state.addTypes(resultTypes);
-  state.addOperands({workload});
-  state.addOperands(operands);
-  state.addOperands(initialValues);
-  state.addAttribute(
-      "dimensions",
-      DenseIntElementsAttr::get(
-          VectorType::get({static_cast<int32_t>(dimensions.size())},
-                          builder->getIntegerType(32)),
-          dimensions));
-  state.addAttributes(attributes);
-  state.addRegion();
-  state.addRegion();
-  state.setOperandListToResizable();
-}
-
-ParseResult parseReductionRegionOp(OpAsmParser &parser,
-                                   OperationState *result) {
-  OpAsmParser::OperandType workloadArg;
-  Type workloadArgType;
-  if (failed(parser.parseLSquare()) ||
-      failed(parser.parseOperand(workloadArg)) ||
-      failed(parser.parseColonType(workloadArgType)) ||
-      failed(parser.parseRSquare()) ||
-      failed(parser.resolveOperand(workloadArg, workloadArgType,
-                                   result->operands))) {
-    return failure();
-  }
-
-  if (failed(parser.parseLParen())) {
-    return failure();
-  }
-  SmallVector<OpAsmParser::OperandType, 8> regionArgs;
-  SmallVector<OpAsmParser::OperandType, 8> reductionOperands;
-  SmallVector<Type, 8> reductionOperandTypes;
-  auto operandsLoc = parser.getCurrentLocation();
-  do {
-    // Reserve entries in the lists.
-    regionArgs.emplace_back();
-    reductionOperands.emplace_back();
-    reductionOperandTypes.emplace_back();
-    if (failed(parser.parseRegionArgument(regionArgs.back())) ||
-        failed(parser.parseEqual()) ||
-        failed(parser.parseOperand(reductionOperands.back())) ||
-        failed(parser.parseColonType(reductionOperandTypes.back()))) {
-      return failure();
-    }
-  } while (succeeded(parser.parseOptionalComma()));
-  if (failed(parser.parseRParen()) ||
-      failed(parser.parseArrowTypeList(result->types)) ||
-      failed(parser.resolveOperands(reductionOperands, reductionOperandTypes,
-                                    operandsLoc, result->operands))) {
-    return failure();
-  }
-  result->setOperandListToResizable();
-
-  Region *dispatchRegion = result->addRegion();
-  if (failed(parser.parseRegion(*dispatchRegion, regionArgs,
-                                reductionOperandTypes))) {
-    return failure();
-  }
-
-  SmallVector<OpAsmParser::OperandType, 4> invocationRegionArgs;
-  SmallVector<Type, 4> invocationRegionArgTypes;
-  if (failed(parser.parseKeyword("invocation")) ||
-      failed(parser.parseLParen())) {
-    return failure();
-  }
-  do {
-    Type argType;
-    SmallVector<OpAsmParser::OperandType, 2> reductionRegionArgs;
-    if (failed(parser.parseLParen()) ||
-        failed(parser.parseOperandList(reductionRegionArgs, 2)) ||
-        failed(parser.parseRParen()) ||
-        failed(parser.parseColonType(argType))) {
-      return failure();
-    }
-    invocationRegionArgs.push_back(reductionRegionArgs[0]);
-    invocationRegionArgTypes.push_back(argType);
-    invocationRegionArgs.push_back(reductionRegionArgs[1]);
-    invocationRegionArgTypes.push_back(argType);
-  } while (succeeded(parser.parseOptionalComma()));
-  SmallVector<Type, 4> invocationResultTypes;
-  if (failed(parser.parseRParen()) ||
-      failed(parser.parseArrowTypeList(invocationResultTypes))) {
-    return failure();
-  }
-
-  // Parse invocation body.
-  Region *invocationRegion = result->addRegion();
-  if (failed(parser.parseRegion(*invocationRegion, invocationRegionArgs,
-                                invocationRegionArgTypes)) ||
-      failed(parser.parseOptionalAttrDict(result->attributes))) {
-    return failure();
-  }
-
-  return success();
-}
-
-void printReductionRegionOp(OpAsmPrinter &p, ReductionRegionOp op) {
-  p << op.getOperationName();
-
-  // Print the workload argument.
-  p << "[";
-  p.printOperand(op.workload());
-  p << " : ";
-  p.printType(op.workload().getType());
-  p << "]";
-
-  auto &dispatchBlock = op.dispatch().front();
-  p << "(";
-  interleaveComma(llvm::zip(dispatchBlock.getArguments(), op.operands()), p,
-                  [&](std::tuple<BlockArgument, Value> it) {
-                    p << std::get<0>(it) << " = " << std::get<1>(it);
-                    p << " : ";
-                    p << std::get<1>(it).getType();
-                  });
-  p << ", ";
-  interleaveComma(
-      llvm::zip(dispatchBlock.getArguments().slice(op.operands().size()),
-                op.initial_values()),
-      p, [&](std::tuple<BlockArgument, Value> it) {
-        p << std::get<0>(it) << " = " << std::get<1>(it);
-        p << " : ";
-        p << std::get<1>(it).getType();
-      });
-  p << ")";
-  if (op.getNumResults() > 0) {
-    p << " -> ";
-    if (op.getNumResults() > 1) p << "(";
-    interleaveComma(op.getResultTypes(), p);
-    if (op.getNumResults() > 1) p << ")";
-  }
-  p.printRegion(op.dispatch(), /*printEntryBlockArgs=*/false);
-
-  p << " invocation(";
-  auto invocationType = op.getInvocationType();
-  auto &entryBlock = op.invocation().getBlocks().front();
-  int regionArgIndex = 0;
-  interleaveComma(invocationType.getInputs(), p, [&](Type operandType) {
-    p << "(";
-    p.printOperand(entryBlock.getArgument(regionArgIndex++));
-    p << ", ";
-    p.printOperand(entryBlock.getArgument(regionArgIndex++));
-    p << ") : ";
-    p.printType(operandType);
-  });
-  p << ")";
-  p.printArrowTypeList(invocationType.getResults());
-  p.printRegion(op.invocation(), /*printEntryBlockArgs=*/false);
-
-  p.printOptionalAttrDict(op.getAttrs(),
-                          /*elidedAttrs=*/{});
-}
-
-FunctionType ReductionRegionOp::getInvocationType() {
-  return FunctionType::get(llvm::to_vector<4>(initial_values().getTypes()),
-                           llvm::to_vector<4>(initial_values().getTypes()),
-                           getContext());
-}
-
-//===----------------------------------------------------------------------===//
-// flow.windowed_reduction.region
-//===----------------------------------------------------------------------===//
-
-void WindowedReductionRegionOp::build(
-    Builder *builder, OperationState &state, ArrayRef<Type> resultTypes,
-    Value workload, ValueRange operands, ValueRange initialValues,
-    ArrayRef<int32_t> windowDimensions, ArrayRef<int32_t> windowStrides,
-    ArrayRef<int32_t> baseDilations, ArrayRef<int32_t> windowDilations,
-    PaddingMode paddingMode, ArrayRef<NamedAttribute> attributes) {
-  state.addTypes(resultTypes);
-  state.addOperands({workload});
-  state.addOperands(operands);
-  state.addOperands(initialValues);
-  state.addAttribute(
-      "window_dimensions",
-      DenseIntElementsAttr::get(
-          VectorType::get({static_cast<int32_t>(windowDimensions.size())},
-                          builder->getIntegerType(32)),
-          windowDimensions));
-  state.addAttribute(
-      "window_strides",
-      DenseIntElementsAttr::get(
-          VectorType::get({static_cast<int32_t>(windowStrides.size())},
-                          builder->getIntegerType(32)),
-          windowStrides));
-  state.addAttribute(
-      "base_dilations",
-      DenseIntElementsAttr::get(
-          VectorType::get({static_cast<int32_t>(baseDilations.size())},
-                          builder->getIntegerType(32)),
-          baseDilations));
-  state.addAttribute(
-      "window_dilations",
-      DenseIntElementsAttr::get(
-          VectorType::get({static_cast<int32_t>(windowDilations.size())},
-                          builder->getIntegerType(32)),
-          windowDilations));
-  state.addAttribute("padding_mode", builder->getI32IntegerAttr(
-                                         static_cast<int32_t>(paddingMode)));
-  state.addAttributes(attributes);
-  state.addRegion();
-  state.addRegion();
-  state.setOperandListToResizable();
-}
-
-ParseResult parseWindowedReductionRegionOp(OpAsmParser &parser,
-                                           OperationState *result) {
-  return parseReductionRegionOp(parser, result);
-}
-
-void printWindowedReductionRegionOp(OpAsmPrinter &p,
-                                    WindowedReductionRegionOp op) {
-  p << op.getOperationName();
-
-  // Print the workload argument.
-  p << "[";
-  p.printOperand(op.workload());
-  p << " : ";
-  p.printType(op.workload().getType());
-  p << "]";
-
-  p << "(";
-  p.printOperands(op.operands());
-  p << ", ";
-  p.printOperands(op.initial_values());
-  p << ")";
-  if (op.getNumResults() > 0) {
-    p << " -> ";
-    if (op.getNumResults() > 1) p << "(";
-    interleaveComma(op.getResultTypes(), p);
-    if (op.getNumResults() > 1) p << ")";
-  }
-  p.printRegion(op.dispatch(), /*printEntryBlockArgs=*/false);
-
-  p << " invocation(";
-  auto invocationType = op.getInvocationType();
-  auto &entryBlock = op.invocation().getBlocks().front();
-  int regionArgIndex = 0;
-  interleaveComma(invocationType.getInputs(), p, [&](Type operandType) {
-    p << "(";
-    p.printOperand(entryBlock.getArgument(regionArgIndex++));
-    p << ", ";
-    p.printOperand(entryBlock.getArgument(regionArgIndex++));
-    p << ") : ";
-    p.printType(operandType);
-  });
-  p << ")";
-  p.printArrowTypeList(invocationType.getResults());
-  p.printRegion(op.invocation(), /*printEntryBlockArgs=*/false);
-
-  p.printOptionalAttrDict(op.getAttrs(),
-                          /*elidedAttrs=*/{});
-}
-
-FunctionType WindowedReductionRegionOp::getInvocationType() {
-  return FunctionType::get(llvm::to_vector<4>(initial_values().getTypes()),
-                           llvm::to_vector<4>(initial_values().getTypes()),
-                           getContext());
-}
-
-//===----------------------------------------------------------------------===//
 // flow.executable
 //===----------------------------------------------------------------------===//
 
@@ -714,75 +443,6 @@
 }
 
 //===----------------------------------------------------------------------===//
-// flow.reduction.entry / flow.windowed_reduction.entry
-//===----------------------------------------------------------------------===//
-
-static ParseResult parseReductionEntryOp(OpAsmParser &parser,
-                                         OperationState *result) {
-  FlatSymbolRefAttr functionRefAttr;
-  FlatSymbolRefAttr applyRefAttr;
-  if (failed(parser.parseAttribute(functionRefAttr, "function_ref",
-                                   result->attributes)) ||
-      failed(parser.parseKeyword("apply")) || failed(parser.parseLParen()) ||
-      failed(parser.parseAttribute(applyRefAttr, "apply_ref",
-                                   result->attributes)) ||
-      failed(parser.parseRParen())) {
-    return failure();
-  }
-
-  if (succeeded(parser.parseOptionalKeyword("as"))) {
-    StringAttr exportNameAttr;
-    if (failed(parser.parseLParen()) ||
-        failed(parser.parseAttribute(exportNameAttr, "sym_name",
-                                     result->attributes)) ||
-        failed(parser.parseRParen())) {
-      return failure();
-    }
-  } else {
-    result->addAttribute("sym_name", parser.getBuilder().getStringAttr(
-                                         functionRefAttr.getValue()));
-  }
-
-  if (failed(parser.parseOptionalAttrDictWithKeyword(result->attributes))) {
-    return failure();
-  }
-
-  return success();
-}
-
-static void printReductionEntryOp(OpAsmPrinter &p, ReductionEntryOp op) {
-  p << op.getOperationName() << ' ';
-  p.printSymbolName(op.function_ref());
-  p << " apply(";
-  p.printSymbolName(op.apply_ref());
-  p << ")";
-  if (op.sym_name() != op.function_ref()) {
-    p << " as(\"" << op.sym_name() << "\")";
-  }
-  p.printOptionalAttrDictWithKeyword(
-      op.getAttrs(), /*elidedAttrs=*/{"apply_ref", "function_ref", "sym_name"});
-}
-
-static ParseResult parseWindowedReductionEntryOp(OpAsmParser &parser,
-                                                 OperationState *result) {
-  return parseReductionEntryOp(parser, result);
-}
-
-static void printWindowedReductionEntryOp(OpAsmPrinter &p,
-                                          WindowedReductionEntryOp op) {
-  p << op.getOperationName() << ' ';
-  p.printSymbolName(op.function_ref());
-  p << " apply(";
-  p.printSymbolName(op.apply_ref());
-  p << ")";
-  if (op.sym_name() != op.function_ref()) {
-    p << " as(\"" << op.sym_name() << "\")";
-  }
-  p.printOptionalAttrDictWithKeyword(
-      op.getAttrs(), /*elidedAttrs=*/{"apply_ref", "function_ref", "sym_name"});
-}
-
-//===----------------------------------------------------------------------===//
 // flow.dispatch
 //===----------------------------------------------------------------------===//
 
diff --git a/iree/compiler/Dialect/Flow/IR/FlowOps.td b/iree/compiler/Dialect/Flow/IR/FlowOps.td
index 452d0c4..23a5616 100644
--- a/iree/compiler/Dialect/Flow/IR/FlowOps.td
+++ b/iree/compiler/Dialect/Flow/IR/FlowOps.td
@@ -204,119 +204,6 @@
   ];
 }
 
-def FLOW_ReductionRegionOp : FLOW_PureOp<"reduction.region", [
-    IsolatedFromAbove,
-    SameVariadicOperandSize,
-    // TODO(benvanik): verify operands and initial values have the same element
-    // types (but NOT the same shapes).
-  ]> {
-  let summary = [{partitioned reduction region}];
-  let description = [{
-    A closure that defines a reduction operation over one or more inputs.
-    Reductions are dispatches with very specific semantics around the indexing
-    of work. Partitioning first isolates reduction regions prior to dispatch
-    regions so that such semantics can be identified for folding.
-
-    The `dispatch` region contains the original reduction operation and the
-    `invocation` region contains the outlined per-invocation operation.
-
-    This operation follows the XLA Reduce semantics:
-    https://www.tensorflow.org/xla/operation_semantics#reduce
-  }];
-
-  let arguments = (ins
-    FLOW_Workload:$workload,
-    Variadic<AnyType>:$operands,
-    Variadic<AnyType>:$initial_values,
-    // TODO(benvanik): use index types instead of i32.
-    OptionalAttr<I32ElementsAttr>:$dimensions
-  );
-  let results = (outs
-    Variadic<AnyType>:$results
-  );
-
-  let regions = (region
-    AnyRegion:$dispatch,
-    AnyRegion:$invocation
-  );
-
-  let extraClassDeclaration = [{
-    unsigned getNumReductionOperands() {
-      return std::distance(operands().begin(), operands().end());
-    }
-    FunctionType getInvocationType();
-  }];
-
-  let skipDefaultBuilders = 1;
-  let builders = [
-    OpBuilder<[{
-      Builder *builder, OperationState &state, ArrayRef<Type> resultTypes,
-      Value workload, ValueRange operands,
-      ValueRange initialValues, ArrayRef<int32_t> dimensions,
-      ArrayRef<NamedAttribute> attributes = {}
-    }]>,
-  ];
-}
-
-def FLOW_WindowedReductionRegionOp : FLOW_PureOp<"windowed_reduction.region", [
-    SameVariadicOperandSize,
-    // TODO(benvanik): verify operands and initial values have the same element
-    // types (but NOT the same shapes).
-  ]> {
-  let summary = [{partitioned windowed reduction region}];
-  let description = [{
-    A closure that defines a reduction operation over one or more inputs.
-    Reductions are dispatches with very specific semantics around the indexing
-    of work. Partitioning first isolates reduction regions prior to dispatch
-    regions so that such semantics can be identified for folding.
-
-    The `dispatch` region contains the original reduction operation and the
-    `invocation` region contains the outlined per-invocation operation.
-
-    This operation follows the XLA ReduceWindow semantics:
-    https://www.tensorflow.org/xla/operation_semantics#reducewindow
-  }];
-
-  let arguments = (ins
-    FLOW_Workload:$workload,
-    Variadic<FLOW_Tensor>:$operands,
-    Variadic<FLOW_Tensor>:$initial_values,
-    // TODO(benvanik): use index types instead of i32.
-    I32ElementsAttr:$window_dimensions,
-    I32ElementsAttr:$window_strides,
-    I32ElementsAttr:$base_dilations,
-    I32ElementsAttr:$window_dilations,
-    FLOW_PaddingModeAttr:$padding_mode
-  );
-  let results = (outs
-    Variadic<FLOW_Tensor>:$results
-  );
-
-  let regions = (region
-    AnyRegion:$dispatch,
-    AnyRegion:$invocation
-  );
-
-  let extraClassDeclaration = [{
-    unsigned getNumReductionOperands() {
-      return std::distance(operands().begin(), operands().end());
-    }
-    FunctionType getInvocationType();
-  }];
-
-  let skipDefaultBuilders = 1;
-  let builders = [
-    OpBuilder<[{
-      Builder *builder, OperationState &state, ArrayRef<Type> resultTypes,
-      Value workload, ValueRange operands,
-      ValueRange initialValues, ArrayRef<int32_t> windowDimensions,
-      ArrayRef<int32_t> windowStrides, ArrayRef<int32_t> baseDilations,
-      ArrayRef<int32_t> windowDilations, PaddingMode paddingMode,
-      ArrayRef<NamedAttribute> attributes = {}
-    }]>,
-  ];
-}
-
 def FLOW_ReturnOp : FLOW_Op<"return", [Terminator]> {
   let summary = [{return from a flow.dispatch_region}];
   let description = [{
@@ -407,60 +294,6 @@
   );
 }
 
-def FLOW_ReductionEntryOp : FLOW_Op<"reduction.entry", [
-    HasParent<"IREE::Flow::ExecutableOp">,
-    Symbol,
-  ]> {
-  let summary = [{defines an executable entry point for reduction operations}];
-  let description = [{
-    Specifies an exported function with an externally-visible alias. Multiple
-    exports can reference the same internal function. The computation represents
-    a reduction operation that has additional backend-specific semantics that
-    need to be lowered.
-
-    This operation follows the XLA Reduce semantics:
-    https://www.tensorflow.org/xla/operation_semantics#reduce
-  }];
-
-  // TODO(benvanik): add a list of all used workloads.
-  let arguments = (ins
-    // TODO(benvanik): ref into child module.
-    StrAttr:$sym_name,
-    FlatSymbolRefAttr:$function_ref,
-    FlatSymbolRefAttr:$apply_ref,
-    I32Attr:$dimension
-  );
-}
-
-def FLOW_WindowedReductionEntryOp : FLOW_Op<"windowed_reduction.entry", [
-    HasParent<"IREE::Flow::ExecutableOp">,
-    Symbol,
-  ]> {
-  let summary = [{defines an executable entry point for windowed reduction operations}];
-  let description = [{
-    Specifies an exported function with an externally-visible alias. Multiple
-    exports can reference the same internal function. The computation represents
-    a reduction operation that has additional backend-specific semantics that
-    need to be lowered.
-
-    This operation follows the XLA ReduceWindow semantics:
-    https://www.tensorflow.org/xla/operation_semantics#reducewindow
-  }];
-
-  // TODO(benvanik): add a list of all used workloads.
-  let arguments = (ins
-    // TODO(benvanik): ref into child module.
-    StrAttr:$sym_name,
-    FlatSymbolRefAttr:$function_ref,
-    FlatSymbolRefAttr:$apply_ref,
-    I32Attr:$window_dimension,
-    I32Attr:$window_stride,
-    I32Attr:$base_dilation,
-    I32Attr:$window_dilation,
-    FLOW_PaddingModeAttr:$padding
-  );
-}
-
 //===----------------------------------------------------------------------===//
 // Dispatch ops
 //===----------------------------------------------------------------------===//
diff --git a/iree/compiler/Dialect/Flow/IR/test/executable_ops.mlir b/iree/compiler/Dialect/Flow/IR/test/executable_ops.mlir
index 94d7908..4494dff 100644
--- a/iree/compiler/Dialect/Flow/IR/test/executable_ops.mlir
+++ b/iree/compiler/Dialect/Flow/IR/test/executable_ops.mlir
@@ -16,24 +16,3 @@
   // CHECK: flow.dispatch.entry @dispatch0 as("dispatch0_alias")
   flow.dispatch.entry @dispatch0 as("dispatch0_alias")
 }
-
-// -----
-
-// CHECK-LABEL: @reduction_ex
-flow.executable @reduction_ex {
-  // CHECK: module {
-  module {
-    // CHECK: @entry
-    func @entry(tensor<4x8xf32>, tensor<f32>) -> tensor<4xf32>
-    // CHECK: @apply
-    func @apply(%arg0: tensor<f32>, %arg1: tensor<f32>) -> tensor<f32> {
-      %0 = xla_hlo.add %arg0, %arg1 : tensor<f32>
-      return %0 : tensor<f32>
-    }
-  }
-  // CHECK: flow.reduction.entry @entry
-  // CHECK-SAME: apply(@apply)
-  // CHECK-SAME: as("entry_alias")
-  // CHECK-SAME: attributes {dimension = 1 : i32}
-  flow.reduction.entry @entry apply(@apply) as("entry_alias") attributes {dimension = 1 : i32}
-}
diff --git a/iree/compiler/Dialect/Flow/IR/test/reduction_regions.mlir b/iree/compiler/Dialect/Flow/IR/test/reduction_regions.mlir
deleted file mode 100644
index 12ea0e6..0000000
--- a/iree/compiler/Dialect/Flow/IR/test/reduction_regions.mlir
+++ /dev/null
@@ -1,68 +0,0 @@
-// Tests printing and parsing of reduction region ops.
-
-// RUN: iree-opt -split-input-file %s | iree-opt -split-input-file | IreeFileCheck %s
-
-// CHECK-LABEL: @singleReduction
-func @singleReduction(%arg0 : tensor<5x1xf32>) {
-  // CHECK: [[WORKLOAD:%.+]] = "some.shape"(%arg0) : (tensor<5x1xf32>) -> vector<3xi32>
-  %workload = "some.shape"(%arg0) : (tensor<5x1xf32>) -> vector<3xi32>
-  // CHECK: [[INITIALF:%.+]] = "some.constant"() : () -> tensor<f32>
-  %initialValueF = "some.constant"() : () -> tensor<f32>
-  //      CHECK: = flow.reduction.region[
-  // CHECK-SAME:     [[WORKLOAD]] : vector<3xi32>
-  // CHECK-SAME:   ](
-  // CHECK-SAME:     %arg1 = %arg0 : tensor<5x1xf32>, %arg2 = [[INITIALF]] : tensor<f32>) -> tensor<1xf32> {
-  // CHECK-NEXT:   "xla_hlo.reduce"(%arg1, %arg2) ( {
-  //      CHECK: } invocation((%arg1, %arg2) : tensor<f32>) -> tensor<f32> {
-  // CHECK-NEXT:   = xla_hlo.add %arg1, %arg2 : tensor<f32>
-  // CHECK-NEXT:   flow.return %3 : tensor<f32>
-  // CHECK-NEXT: } {dimensions = dense<1> : vector<1xi32>}
-  %0 = flow.reduction.region[%workload : vector<3xi32>](%arg1 = %arg0 : tensor<5x1xf32>, %arg2 = %initialValueF : tensor<f32>) -> tensor<1xf32> {
-    %1 = "xla_hlo.reduce"(%arg1, %arg2) ( {
-    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>): // no predecessors
-      %2 = xla_hlo.add %arg3, %arg4 : tensor<f32>
-      "xla_hlo.return"(%2) : (tensor<f32>) -> ()
-    }) {dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<5x1xf32>, tensor<f32>) -> tensor<1xf32>
-    flow.return %1 : tensor<1xf32>
-  } invocation((%arg1, %arg2) : tensor<f32>) -> tensor<f32> {
-    %1 = xla_hlo.add %arg1, %arg2 : tensor<f32>
-    flow.return %1 : tensor<f32>
-  } {dimensions = dense<1> : vector<1xi32>}
-  return
-}
-
-// -----
-
-// CHECK-LABEL: @fusedReduction
-func @fusedReduction(%arg0 : tensor<4x8xf32>, %arg1 : tensor<4x8xi32>) {
-  // CHECK: [[WORKLOAD:%.+]] = "some.shape"(%arg0) : (tensor<4x8xf32>) -> vector<3xi32>
-  %workload = "some.shape"(%arg0) : (tensor<4x8xf32>) -> vector<3xi32>
-  // CHECK: [[INITIALF:%.+]] = "some.constant"() : () -> tensor<f32>
-  // CHECK: [[INITIALI:%.+]] = "some.constant"() : () -> tensor<i32>
-  %initialValueF = "some.constant"() : () -> tensor<f32>
-  %initialValueI = "some.constant"() : () -> tensor<i32>
-  //      CHECK: = flow.reduction.region[
-  // CHECK-SAME:       [[WORKLOAD]] : vector<3xi32>
-  // CHECK-SAME:     ](
-  // CHECK-SAME:       %arg2 = %arg0 : tensor<4x8xf32>, %arg3 = %arg1 : tensor<4x8xi32>, %arg4 = [[INITIALF]] : tensor<f32>, %arg5 = [[INITIALI]] : tensor<i32>) -> (tensor<4xf32>, tensor<4xi32>) {
-  // CHECK-NEXT:   "xla_hlo.reduce"(%arg2, %arg3, %arg4, %arg5) ( {
-  //      CHECK: } invocation((%arg2, %arg3) : tensor<f32>, (%arg4, %arg5) : tensor<i32>) -> (tensor<f32>, tensor<i32>) {
-  // CHECK-NEXT:   = xla_hlo.add %arg2, %arg3 : tensor<f32>
-  // CHECK-NEXT:   = xla_hlo.add %arg4, %arg5 : tensor<i32>
-  // CHECK-NEXT:   flow.return %4, %5 : tensor<f32>, tensor<i32>
-  // CHECK-NEXT: } {dimensions = dense<1> : vector<1xi32>}
-  %0:2 = flow.reduction.region[%workload : vector<3xi32>](%arg2 = %arg0 : tensor<4x8xf32>, %arg3 = %arg1 : tensor<4x8xi32>, %arg4 = %initialValueF : tensor<f32>, %arg5 = %initialValueI : tensor<i32>) -> (tensor<4xf32>, tensor<4xi32>) {
-    %1:2 = "xla_hlo.reduce"(%arg2, %arg3, %arg4, %arg5) ( {
-    ^bb0(%arg6: tensor<f32>, %arg7: tensor<i32>, %arg8: tensor<f32>, %arg9: tensor<i32>): // no predecessors
-      %2 = xla_hlo.add %arg6, %arg8 : tensor<f32>
-      %3 = xla_hlo.add %arg7, %arg9 : tensor<i32>
-      "xla_hlo.return"(%2, %3) : (tensor<f32>, tensor<i32>) -> ()
-    }) {dimensions = dense<1> : tensor<1xi64>} : (tensor<4x8xf32>, tensor<4x8xi32>, tensor<f32>, tensor<i32>) -> (tensor<4xf32>, tensor<4xi32>)
-    flow.return %1#0, %1#1 : tensor<4xf32>, tensor<4xi32>
-  } invocation((%arg2, %arg3) : tensor<f32>, (%arg4, %arg5) : tensor<i32>) -> (tensor<f32>, tensor<i32>) {
-    %1 = xla_hlo.add %arg2, %arg3 : tensor<f32>
-    %2 = xla_hlo.add %arg4, %arg5 : tensor<i32>
-    flow.return %1, %2 : tensor<f32>, tensor<i32>
-  } {dimensions = dense<1> : vector<1xi32>}
-  return
-}
diff --git a/iree/compiler/Dialect/Flow/Transforms/BUILD b/iree/compiler/Dialect/Flow/Transforms/BUILD
index 2daf940..d4982ef 100644
--- a/iree/compiler/Dialect/Flow/Transforms/BUILD
+++ b/iree/compiler/Dialect/Flow/Transforms/BUILD
@@ -26,12 +26,10 @@
         "FoldCompatibleDispatchRegions.cpp",
         "FormStreams.cpp",
         "IdentifyDispatchRegions.cpp",
-        "IdentifyReductionRegions.cpp",
         "LegalizeInputTypes.cpp",
         "MaterializeExportedReflection.cpp",
         "MergeExportedReflection.cpp",
         "OutlineDispatchRegions.cpp",
-        "OutlineReductionRegions.cpp",
         "Passes.cpp",
         "PrePostPartitioningConversion.cpp",
         "RematerializeDispatchConstants.cpp",
diff --git a/iree/compiler/Dialect/Flow/Transforms/CMakeLists.txt b/iree/compiler/Dialect/Flow/Transforms/CMakeLists.txt
index 406aea1..5c47dd9 100644
--- a/iree/compiler/Dialect/Flow/Transforms/CMakeLists.txt
+++ b/iree/compiler/Dialect/Flow/Transforms/CMakeLists.txt
@@ -26,12 +26,10 @@
     "FoldCompatibleDispatchRegions.cpp"
     "FormStreams.cpp"
     "IdentifyDispatchRegions.cpp"
-    "IdentifyReductionRegions.cpp"
     "LegalizeInputTypes.cpp"
     "MaterializeExportedReflection.cpp"
     "MergeExportedReflection.cpp"
     "OutlineDispatchRegions.cpp"
-    "OutlineReductionRegions.cpp"
     "Passes.cpp"
     "PrePostPartitioningConversion.cpp"
     "RematerializeDispatchConstants.cpp"
diff --git a/iree/compiler/Dialect/Flow/Transforms/IdentifyReductionRegions.cpp b/iree/compiler/Dialect/Flow/Transforms/IdentifyReductionRegions.cpp
deleted file mode 100644
index 41d35fe..0000000
--- a/iree/compiler/Dialect/Flow/Transforms/IdentifyReductionRegions.cpp
+++ /dev/null
@@ -1,184 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <algorithm>
-
-#include "iree/compiler/Dialect/Flow/IR/FlowOps.h"
-#include "iree/compiler/Dialect/Flow/Utils/DispatchUtils.h"
-#include "iree/compiler/Dialect/Flow/Utils/WorkloadUtils.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include "mlir/Dialect/StandardOps/IR/Ops.h"
-#include "mlir/IR/Attributes.h"
-#include "mlir/IR/BlockAndValueMapping.h"
-#include "mlir/IR/Builders.h"
-#include "mlir/IR/Location.h"
-#include "mlir/IR/MLIRContext.h"
-#include "mlir/IR/StandardTypes.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Pass/PassRegistry.h"
-#include "mlir/Support/LLVM.h"
-#include "mlir/Support/LogicalResult.h"
-#include "mlir/Transforms/Utils.h"
-#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h"
-
-namespace mlir {
-namespace iree_compiler {
-namespace IREE {
-namespace Flow {
-
-namespace {
-
-// Builds a new reduction region with the given |invocationRegion|.
-// The new region will be inserted after |dispatchOp|.
-//
-// All |invocationRegion| ops must be compatible with the |workload| specified
-// as they will all be dispatched with the same workgroup structure. The
-// |invocationRegion| will not be modified.
-LogicalResult buildReductionRegion(Operation *dispatchOp,
-                                   ArrayRef<Value> operands,
-                                   ArrayRef<Value> initialValues,
-                                   ArrayRef<int32_t> dimensions,
-                                   Region &invocationRegion) {
-  OpBuilder parentBuilder(dispatchOp);
-
-  // Compute the workload based on the output shape.
-  // When variadic all output shapes match so we can just take the first.
-  auto workload = calculateWorkload(dispatchOp, dispatchOp->getResult(0));
-
-  // Build the region op and add it to the parent block.
-  auto reductionRegionOp = parentBuilder.create<ReductionRegionOp>(
-      dispatchOp->getLoc(), dispatchOp->getResultTypes(), workload, operands,
-      initialValues, dimensions);
-
-  // Clone the dispatch op (xla_hlo.reduce, etc) into the dispatch region. This
-  // way we preserve the original op all the way through the pipeline while
-  // still exposing it with standardized attributes for the later scheduler
-  // passes.
-  OpBuilder dispatchBuilder(dispatchOp->getContext());
-  auto *dispatchBlock =
-      dispatchBuilder.createBlock(&reductionRegionOp.dispatch());
-  BlockAndValueMapping dispatchMapping;
-  for (auto operand : operands) {
-    dispatchMapping.map(operand, dispatchBlock->addArgument(operand.getType()));
-  }
-  for (auto initialValue : initialValues) {
-    dispatchMapping.map(initialValue,
-                        dispatchBlock->addArgument(initialValue.getType()));
-  }
-  auto *clonedOp = dispatchBuilder.clone(*dispatchOp, dispatchMapping);
-  dispatchBuilder.create<ReturnOp>(dispatchOp->getLoc(),
-                                   clonedOp->getResults());
-
-  // Create the block and setup the arg mapping for captured values.
-  BlockAndValueMapping invocationMapping;
-  invocationRegion.cloneInto(&reductionRegionOp.invocation(),
-                             invocationMapping);
-
-  // Replace xla_hlo.return -> flow.return.
-  OpBuilder regionBuilder(reductionRegionOp.invocation());
-  reductionRegionOp.invocation().walk([&](xla_hlo::ReturnOp returnOp) {
-    regionBuilder.setInsertionPoint(returnOp);
-    regionBuilder.create<ReturnOp>(returnOp.getLoc(), returnOp.getOperands());
-    returnOp.erase();
-  });
-
-  // Replace usage of values with the results of the region.
-  for (int i = 0; i < dispatchOp->getNumResults(); ++i) {
-    dispatchOp->getResult(i).replaceAllUsesWith(reductionRegionOp.getResult(i));
-  }
-
-  return success();
-}
-
-// Converts an xla_hlo::ReduceOp to a reduction region and inlines the target
-// computation into the region body.
-LogicalResult buildReductionRegionFromXLAReduceOp(xla_hlo::ReduceOp reduceOp) {
-  SmallVector<Value, 4> operands(reduceOp.getOperands());
-  OperandAdaptor<xla_hlo::ReduceOp> adaptor(operands);
-
-  SmallVector<int32_t, 4> dimensions;
-  for (auto dim : reduceOp.dimensions().getIntValues()) {
-    dimensions.push_back(dim.getSExtValue());
-  }
-
-  // Create the reduction region op with the reduction computation.
-  if (failed(buildReductionRegion(reduceOp, adaptor.operands(),
-                                  adaptor.init_values(), dimensions,
-                                  reduceOp.body()))) {
-    return failure();
-  }
-
-  // Remove original XLA reduction op.
-  reduceOp.erase();
-
-  return success();
-}
-
-// Identifies reduction ops and moves them into reduction regions.
-LogicalResult identifyBlockReductionRegions(FuncOp funcOp, Block *block) {
-  // Fixed point iteration until we can no longer fuse anything.
-  bool didFindAnyNewRegions;
-  do {
-    // Iterate in reverse so we root further along in the op list.
-    didFindAnyNewRegions = false;
-    for (auto &rootOp : llvm::reverse(*block)) {
-      if (auto reduceOp = dyn_cast<xla_hlo::ReduceOp>(rootOp)) {
-        if (failed(buildReductionRegionFromXLAReduceOp(reduceOp))) {
-          return failure();
-        }
-
-        // Successfully created a dispatch region from the ops and we must now
-        // start over again as we've likely trashed the whole block structure.
-        didFindAnyNewRegions = true;
-        break;
-      }
-    }
-  } while (didFindAnyNewRegions);
-  return success();
-}
-
-}  // namespace
-
-// Identifies reduction ops and moves their targets into reduction regions.
-class IdentifyReductionRegionsPass
-    : public ModulePass<IdentifyReductionRegionsPass> {
- public:
-  void runOnModule() override {
-    for (auto funcOp : getModule().getOps<FuncOp>()) {
-      for (auto &block : funcOp) {
-        if (failed(identifyBlockReductionRegions(funcOp, &block))) {
-          return signalPassFailure();
-        }
-      }
-    }
-  }
-};
-
-std::unique_ptr<OpPassBase<ModuleOp>> createIdentifyReductionRegionsPass() {
-  return std::make_unique<IdentifyReductionRegionsPass>();  // NOLINT
-}
-
-static PassRegistration<IdentifyReductionRegionsPass> pass(
-    "iree-flow-identify-reduction-regions",
-    "Identifies reduction regions based on input reduction ops");
-
-}  // namespace Flow
-}  // namespace IREE
-}  // namespace iree_compiler
-}  // namespace mlir
diff --git a/iree/compiler/Dialect/Flow/Transforms/OutlineReductionRegions.cpp b/iree/compiler/Dialect/Flow/Transforms/OutlineReductionRegions.cpp
deleted file mode 100644
index bee1284..0000000
--- a/iree/compiler/Dialect/Flow/Transforms/OutlineReductionRegions.cpp
+++ /dev/null
@@ -1,353 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <utility>
-
-#include "iree/compiler/Dialect/Flow/IR/FlowOps.h"
-#include "iree/compiler/Dialect/Flow/Utils/DispatchUtils.h"
-#include "iree/compiler/Dialect/Flow/Utils/WorkloadUtils.h"
-#include "mlir/IR/Builders.h"
-#include "mlir/IR/StandardTypes.h"
-#include "mlir/Pass/Pass.h"
-
-namespace mlir {
-namespace iree_compiler {
-namespace IREE {
-namespace Flow {
-
-namespace {
-
-// Determines the shapes involved with reducing this dimension.
-SmallVector<int64_t, 4> calculateResultShape(Value input, int windowDimension) {
-  SmallVector<int64_t, 4> resultShape;
-  for (auto it :
-       llvm::enumerate(input.getType().cast<ShapedType>().getShape())) {
-    if (it.index() != windowDimension) {
-      resultShape.push_back(it.value());
-    }
-  }
-  return resultShape;
-}
-
-// Converts a reduction_region into a dispatch to the outlined region function
-// for a single reduction dimension.
-// Returns the results of the reduction or empty if the construction fails.
-SmallVector<Value, 4> convertToDispatchOp(
-    Operation *regionOp, ExecutableOp executableOp, StringRef entryPointName,
-    int reductionDimension, SmallVector<Value, 4> initialValues,
-    SmallVector<Value, 4> inputs, OpBuilder &dispatcherBuilder) {
-  SmallVector<Type, 4> resultTypes;
-  for (auto resultType : llvm::enumerate(regionOp->getResultTypes())) {
-    // Allocate output buffer in the dispatcher to pass in to the region.
-    auto shapedType = resultType.value().cast<ShapedType>();
-    auto reducedType = RankedTensorType::get(
-        calculateResultShape(inputs[resultType.index()], reductionDimension),
-        shapedType.getElementType());
-    resultTypes.push_back(reducedType);
-  }
-
-  // Calculate workload from the result shape.
-  auto workload = calculateWorkload(regionOp, regionOp->getResult(0));
-
-  // Create the reduce op to the executable function.
-  std::vector<Value> allOperands;
-  allOperands.insert(allOperands.end(), inputs.begin(), inputs.end());
-  allOperands.insert(allOperands.end(), initialValues.begin(),
-                     initialValues.end());
-  auto dispatchOp = dispatcherBuilder.create<DispatchOp>(
-      regionOp->getLoc(), executableOp.getName(), entryPointName, workload,
-      resultTypes, allOperands);
-
-  return dispatchOp.getResults();
-}
-
-// Creates an executable that holds the given elemental reduction region.
-// The executable will have an entry point taking the specified reduction values
-// and writing the results to output arguments.
-std::pair<ExecutableOp, ReductionEntryOp> createReductionExecutable(
-    ReductionRegionOp regionOp, int outlinedRegionOrdinal,
-    int separatedReductionIndex, int reductionDimension,
-    SmallVector<Value, 4> initialValues, SmallVector<Value, 4> inputs,
-    llvm::StringMap<FuncOp> &dispatchableFuncOps) {
-  // Create a new entry point that we can use with the signature for this
-  // single dimension we are operating on.
-  SmallVector<Type, 8> allOperandTypes;
-  auto inputTypes =
-      llvm::map_range(inputs, [](Value value) { return value.getType(); });
-  allOperandTypes.append(inputTypes.begin(), inputTypes.end());
-  auto initialValueTypes = llvm::map_range(
-      initialValues, [](Value value) { return value.getType(); });
-  allOperandTypes.append(initialValueTypes.begin(), initialValueTypes.end());
-  SmallVector<Type, 4> resultTypes;
-  for (auto resultType : llvm::enumerate(regionOp.getResultTypes())) {
-    auto shapedType = resultType.value().cast<ShapedType>();
-    auto reducedType = RankedTensorType::get(
-        calculateResultShape(inputs[resultType.index()], reductionDimension),
-        shapedType.getElementType());
-    resultTypes.push_back(reducedType);
-  }
-  auto dispatchFuncType =
-      FunctionType::get(allOperandTypes, resultTypes, regionOp.getContext());
-
-  // Create the dispatch and invocation functions.
-  auto parentFuncOp = regionOp.getParentOfType<FuncOp>();
-  std::string namePrefix = parentFuncOp.getName().str() + "_reduce_" +
-                           std::to_string(outlinedRegionOrdinal) + "_dim_" +
-                           std::to_string(separatedReductionIndex);
-  std::string dispatchFuncName = namePrefix + "_dispatch";
-  auto dispatchFuncOp =
-      FuncOp::create(regionOp.getLoc(), dispatchFuncName, dispatchFuncType);
-  std::string invocationFuncName = namePrefix + "_invocation";
-  auto invocationFuncOp = createRegionFunction(
-      regionOp.getLoc(), invocationFuncName, regionOp.invocation());
-
-  // Create the executable with the region cloned into it.
-  auto executableOp = createExecutable(
-      regionOp.getLoc(), namePrefix, {dispatchFuncOp, invocationFuncOp},
-      parentFuncOp.getParentOfType<ModuleOp>(), dispatchableFuncOps);
-  executableOp.getOperation()->moveBefore(parentFuncOp);
-
-  // Add dispatch export pointing at the function.
-  OpBuilder builder(executableOp.body());
-  auto entryPointOp = builder.create<ReductionEntryOp>(
-      regionOp.getLoc(), builder.getStringAttr(dispatchFuncOp.getName()),
-      builder.getSymbolRefAttr(dispatchFuncOp),
-      builder.getSymbolRefAttr(invocationFuncOp),
-      builder.getI32IntegerAttr(reductionDimension));
-
-  return {executableOp, entryPointOp};
-}
-
-// Outlines a reduction region into one or more executables.
-// This separates the reduction into multiple dispatches, one for each reduction
-// dimension (thankfully XLA's operation semantics state this is ok). We then
-// special case the first dispatch such that it takes the constant initial
-// values so that we don't have to materialize a buffer for them.
-LogicalResult outlineReductionRegion(
-    ReductionRegionOp regionOp, int outlinedRegionOrdinal,
-    llvm::StringMap<FuncOp> &dispatchableFuncOps) {
-  // Insert at the same place as the original region.
-  OpBuilder dispatcherBuilder(regionOp);
-
-  SmallVector<Value, 4> temps{regionOp.operands()};
-
-  // Create one dispatch per dimension being reduced.
-  // We'll do this by chaining the original input through with the temporary
-  // reduction results. The results we end up with will be the originally
-  // requested shape and we can just substitute them.
-  auto dimensions = regionOp.dimensions().getValue();
-  SmallVector<int32_t, 4> sortedDimensions;
-  for (uint32_t i = 0; i < dimensions.getNumElements(); ++i) {
-    sortedDimensions.push_back(dimensions.getValue<IntegerAttr>({i}).getInt());
-  }
-  llvm::sort(sortedDimensions,
-             [](int32_t a, int32_t b) { return (a - b) > 0; });
-  for (auto dimension : llvm::enumerate(sortedDimensions)) {
-    // Create the executable with the region cloned into it.
-    ExecutableOp executableOp;
-    ReductionEntryOp entryPointOp;
-    std::tie(executableOp, entryPointOp) = createReductionExecutable(
-        regionOp, outlinedRegionOrdinal, dimension.index(), dimension.value(),
-        regionOp.initial_values(), temps, dispatchableFuncOps);
-
-    // Finally convert the dispatch region into a dispatch to the outlined func.
-    temps = convertToDispatchOp(regionOp, executableOp, entryPointOp.getName(),
-                                dimension.value(), regionOp.initial_values(),
-                                std::move(temps), dispatcherBuilder);
-    if (temps.empty()) {
-      return regionOp.emitOpError()
-             << "failed to construct reduction for dimension "
-             << dimension.value();
-    }
-  }
-
-  // Replace uses of the existing results with the new results.
-  for (int i = 0; i < regionOp.getNumResults(); ++i) {
-    regionOp.getResult(i).replaceAllUsesWith(temps[i]);
-  }
-
-  // Erase original region.
-  regionOp.erase();
-
-  return success();
-}
-
-// Creates an executable that holds the given elemental windowed reduction region.
-// The executable will have an entry point taking the specified reduction values
-// and writing the results to output arguments.
-std::pair<ExecutableOp, WindowedReductionEntryOp>
-createWindowedReductionExecutable(
-    WindowedReductionRegionOp regionOp, int outlinedRegionOrdinal,
-    int separatedReductionIndex, int32_t windowDimension, int32_t windowStride,
-    int32_t baseDilation, int32_t windowDilation,
-    SmallVector<Value, 4> initialValues, SmallVector<Value, 4> inputs,
-    llvm::StringMap<FuncOp> &dispatchableFuncOps) {
-  // Create the dispatch and invocation functions.
-  auto parentFuncOp = regionOp.getParentOfType<FuncOp>();
-  std::string namePrefix = parentFuncOp.getName().str() + "_reduce_" +
-                           std::to_string(outlinedRegionOrdinal) + "_dim_" +
-                           std::to_string(separatedReductionIndex);
-  std::string dispatchFuncName = namePrefix + "_dispatch";
-  auto dispatchFuncOp = createRegionFunction(
-      regionOp.getLoc(), dispatchFuncName, regionOp.dispatch());
-  std::string invocationFuncName = namePrefix + "_invocation";
-  auto invocationFuncOp = createRegionFunction(
-      regionOp.getLoc(), invocationFuncName, regionOp.invocation());
-
-  // Create the executable with the region cloned into it.
-  auto executableOp = createExecutable(
-      regionOp.getLoc(), namePrefix, {dispatchFuncOp, invocationFuncOp},
-      parentFuncOp.getParentOfType<ModuleOp>(), dispatchableFuncOps);
-  executableOp.getOperation()->moveBefore(parentFuncOp);
-
-  // Add dispatch export pointing at the function.
-  OpBuilder builder(executableOp.body());
-  auto entryPointOp = builder.create<WindowedReductionEntryOp>(
-      regionOp.getLoc(), builder.getStringAttr(dispatchFuncOp.getName()),
-      builder.getSymbolRefAttr(dispatchFuncOp),
-      builder.getSymbolRefAttr(invocationFuncOp),
-      builder.getI32IntegerAttr(windowDimension),
-      builder.getI32IntegerAttr(windowStride),
-      builder.getI32IntegerAttr(baseDilation),
-      builder.getI32IntegerAttr(windowDilation),
-      builder.getI32IntegerAttr(
-          static_cast<uint32_t>(regionOp.padding_mode())));
-
-  return {executableOp, entryPointOp};
-}
-
-// Outlines a windowed reduction region into one or more executables.
-// This separates the reduction into multiple dispatches, one for each reduction
-// dimension (thankfully XLA's operation semantics state this is ok). We then
-// special case the first dispatch such that it takes the constant initial
-// values so that we don't have to materialize a buffer for them.
-LogicalResult outlineWindowedReductionRegion(
-    WindowedReductionRegionOp regionOp, int outlinedRegionOrdinal,
-    llvm::StringMap<FuncOp> &dispatchableFuncOps) {
-  // Insert at the same place as the original region.
-  OpBuilder dispatcherBuilder(regionOp);
-
-  SmallVector<Value, 4> initialValues{regionOp.initial_values()};
-  SmallVector<Value, 4> temps{regionOp.operands()};
-
-  // Create one dispatch per dimension being reduced.
-  // We'll do this by chaining the original input through with the temporary
-  // reduction results. The results we end up with will be the originally
-  // requested shape and we can just substitute them.
-  using WindowTuple = std::tuple<int32_t, int32_t, int32_t, int32_t>;
-
-  auto windowDimensions = regionOp.window_dimensions();
-  auto windowStrides = regionOp.window_strides();
-  auto baseDilations = regionOp.base_dilations();
-  auto windowDilations = regionOp.window_dilations();
-  SmallVector<WindowTuple, 4> sortedWindowAttrs;
-  for (uint32_t i = 0; i < windowDimensions.getNumElements(); ++i) {
-    int32_t windowDimension =
-        windowDimensions.getValue<IntegerAttr>({i}).getInt();
-    int32_t windowStride = windowStrides.getValue<IntegerAttr>({i}).getInt();
-    int32_t baseDilation = baseDilations.getValue<IntegerAttr>({i}).getInt();
-    int32_t windowDilation =
-        windowDilations.getValue<IntegerAttr>({i}).getInt();
-    sortedWindowAttrs.push_back(WindowTuple(windowDimension, windowStride,
-                                            baseDilation, windowDilation));
-  }
-  llvm::sort(sortedWindowAttrs, [](WindowTuple a, WindowTuple b) {
-    return std::get<0>(a) < std::get<0>(b);
-  });
-  for (auto windowAttrs : llvm::enumerate(sortedWindowAttrs)) {
-    int32_t windowDimension = std::get<0>(windowAttrs.value());
-    int32_t windowStride = std::get<1>(windowAttrs.value());
-    int32_t baseDilation = std::get<2>(windowAttrs.value());
-    int32_t windowDilation = std::get<3>(windowAttrs.value());
-    ExecutableOp executableOp;
-    WindowedReductionEntryOp entryPointOp;
-    std::tie(executableOp, entryPointOp) = createWindowedReductionExecutable(
-        regionOp, outlinedRegionOrdinal, windowAttrs.index(), windowDimension,
-        windowStride, baseDilation, windowDilation, initialValues, temps,
-        dispatchableFuncOps);
-    temps = convertToDispatchOp(regionOp, executableOp, entryPointOp.getName(),
-                                windowDimension, initialValues,
-                                std::move(temps), dispatcherBuilder);
-    if (temps.empty()) {
-      return regionOp.emitOpError()
-             << "failed to construct reduction for windowed dimension "
-             << windowDimension;
-    }
-  }
-
-  // Replace uses of the existing results with the new results.
-  for (int i = 0; i < regionOp.getNumResults(); ++i) {
-    regionOp.getResult(i).replaceAllUsesWith(temps[i]);
-  }
-
-  // Erase original region.
-  regionOp.erase();
-
-  return success();
-}
-
-}  // namespace
-
-class OutlineReductionRegionsPass
-    : public ModulePass<OutlineReductionRegionsPass> {
- public:
-  OutlineReductionRegionsPass() = default;
-  explicit OutlineReductionRegionsPass(
-      std::shared_ptr<llvm::StringMap<FuncOp>> dispatchableFuncOps)
-      : dispatchableFuncOps_(std::move(dispatchableFuncOps)) {}
-
-  void runOnModule() override {
-    // TODO(benvanik): replace with a pattern rewriter?
-    auto funcOps = llvm::to_vector<32>(getModule().getOps<FuncOp>());
-    for (auto funcOp : funcOps) {
-      SmallVector<ReductionRegionOp, 4> reductionRegionOps;
-      funcOp.walk(
-          [&](ReductionRegionOp op) { reductionRegionOps.push_back(op); });
-      for (int i = 0; i < reductionRegionOps.size(); ++i) {
-        if (failed(outlineReductionRegion(reductionRegionOps[i], i,
-                                          *dispatchableFuncOps_))) {
-          return signalPassFailure();
-        }
-      }
-      SmallVector<WindowedReductionRegionOp, 4> windowedReductionRegionOps;
-      funcOp.walk([&](WindowedReductionRegionOp op) {
-        windowedReductionRegionOps.push_back(op);
-      });
-      for (int i = 0; i < windowedReductionRegionOps.size(); ++i) {
-        if (failed(outlineWindowedReductionRegion(windowedReductionRegionOps[i],
-                                                  i, *dispatchableFuncOps_))) {
-          return signalPassFailure();
-        }
-      }
-    }
-  }
-
- private:
-  std::shared_ptr<llvm::StringMap<FuncOp>> dispatchableFuncOps_;
-};
-
-std::unique_ptr<OpPassBase<ModuleOp>> createOutlineReductionRegionsPass(
-    std::shared_ptr<llvm::StringMap<FuncOp>> dispatchableFuncOps) {
-  return std::make_unique<OutlineReductionRegionsPass>(
-      std::move(dispatchableFuncOps));  // NOLINT
-}
-
-static PassRegistration<OutlineReductionRegionsPass> pass(
-    "iree-flow-outline-reduction-regions",
-    "Outlines reduction regions into standalone functions");
-
-}  // namespace Flow
-}  // namespace IREE
-}  // namespace iree_compiler
-}  // namespace mlir
diff --git a/iree/compiler/Dialect/Flow/Transforms/Passes.cpp b/iree/compiler/Dialect/Flow/Transforms/Passes.cpp
index 872c572..91a2e69 100644
--- a/iree/compiler/Dialect/Flow/Transforms/Passes.cpp
+++ b/iree/compiler/Dialect/Flow/Transforms/Passes.cpp
@@ -25,11 +25,6 @@
 namespace IREE {
 namespace Flow {
 
-static llvm::cl::opt<bool> experimentalDispatchReduce{
-    "iree-flow-experimental-dispatch-reduce",
-    llvm::cl::desc("Enables reductions within dispatch regions"),
-    llvm::cl::init(false)};
-
 void buildFlowTransformPassPipeline(OpPassManager &passManager) {
   passManager.addPass(createCanonicalizerPass());
 
@@ -53,16 +48,8 @@
   passManager.addNestedPass<FuncOp>(
       IREE::Flow::createPrePartitioningConversionPass());
 
-  if (experimentalDispatchReduce) {
-    // Unroll multi-dimensional reductions to one reduction per dimension.
-    passManager.addNestedPass<FuncOp>(IREE::Flow::createUnrollReductionsPass());
-  } else {
-    // Find reduction ops and create flow.reduction.regions. We do this prior to
-    // performing dispatch region identification so that we can build as big of
-    // fused reduction regions as possible. The remaining ops will be put into
-    // dispatch regions.
-    passManager.addPass(IREE::Flow::createIdentifyReductionRegionsPass());
-  }
+  // Unroll multi-dimensional reductions to one reduction per dimension.
+  passManager.addNestedPass<FuncOp>(IREE::Flow::createUnrollReductionsPass());
   passManager.addNestedPass<FuncOp>(createCSEPass());
 
   // First perform module-level analysis that following passes will use to query
@@ -87,8 +74,6 @@
   // sequencer functions performing dispatches from the dispatchees.
   passManager.addPass(
       IREE::Flow::createOutlineDispatchRegionsPass(dispatchableFuncOps));
-  passManager.addPass(
-      IREE::Flow::createOutlineReductionRegionsPass(dispatchableFuncOps));
 
   // Cleanup identity ops that clutter up the IR and canonicalize.
   passManager.addNestedPass<FuncOp>(createCanonicalizerPass());
diff --git a/iree/compiler/Dialect/Flow/Transforms/Passes.h b/iree/compiler/Dialect/Flow/Transforms/Passes.h
index b83a600..42aaca9 100644
--- a/iree/compiler/Dialect/Flow/Transforms/Passes.h
+++ b/iree/compiler/Dialect/Flow/Transforms/Passes.h
@@ -110,17 +110,6 @@
     std::shared_ptr<llvm::StringMap<FuncOp>> dispatchableFuncOps);
 
 //===----------------------------------------------------------------------===//
-// Reductions (flow.reduction.region)
-//===----------------------------------------------------------------------===//
-
-// Identifies reduction regions and wraps them in flow.reduction.region ops.
-std::unique_ptr<OpPassBase<ModuleOp>> createIdentifyReductionRegionsPass();
-
-// Outlines reduction regions into executables.
-std::unique_ptr<OpPassBase<ModuleOp>> createOutlineReductionRegionsPass(
-    std::shared_ptr<llvm::StringMap<FuncOp>> dispatchableFuncOps);
-
-//===----------------------------------------------------------------------===//
 // Optimizations
 //===----------------------------------------------------------------------===//
 
diff --git a/iree/compiler/Dialect/Flow/Transforms/test/assign_executable_workloads.mlir b/iree/compiler/Dialect/Flow/Transforms/test/assign_executable_workloads.mlir
index 48bcd5d..9cd1007 100644
--- a/iree/compiler/Dialect/Flow/Transforms/test/assign_executable_workloads.mlir
+++ b/iree/compiler/Dialect/Flow/Transforms/test/assign_executable_workloads.mlir
@@ -18,24 +18,3 @@
   %0 = flow.dispatch @singleStaticWorkload_ex_dispatch_0::@singleStaticWorkload_rgn_dispatch_0[%cst : vector<3xi32>](%arg0) : (tensor<4xf32>) -> tensor<4xf32>
   return %0 : tensor<4xf32>
 }
-
-// -----
-
-flow.executable @reduction_ex_reduce_0_dim_0 {
-  // CHECK-LABEL: flow.reduction.entry @reduction_rgn_reduce_0_dim_0_entry
-  // CHECK-SAME: workload = dense<[4, 1, 1]> : vector<3xi32>
-  flow.reduction.entry @reduction_rgn_reduce_0_dim_0_entry apply(@reduction_rgn_reduce_0_dim_0) attributes {dimension = 1 : i32}
-  module {
-    func @reduction_rgn_reduce_0_dim_0_entry(tensor<4x8xf32>, tensor<f32>) -> tensor<4xf32>
-    func @reduction_rgn_reduce_0_dim_0(%arg0: tensor<f32>, %arg1: tensor<f32>) -> tensor<f32> {
-      %0 = xla_hlo.add %arg0, %arg1 : tensor<f32>
-      return %0 : tensor<f32>
-    }
-  }
-}
-func @reduction(%arg0: tensor<4x8xf32>) -> tensor<4xf32> {
-  %cst = constant dense<0.000000e+00> : tensor<f32>
-  %cst_0 = constant dense<[4, 1, 1]> : vector<3xi32>
-  %0 = flow.dispatch @reduction_ex_reduce_0_dim_0::@reduction_rgn_reduce_0_dim_0_entry[%cst_0 : vector<3xi32>](%arg0, %cst) : (tensor<4x8xf32>, tensor<f32>) -> tensor<4xf32>
-  return %0 : tensor<4xf32>
-}
diff --git a/iree/compiler/Dialect/Flow/Transforms/test/identify_reduction_regions.mlir b/iree/compiler/Dialect/Flow/Transforms/test/identify_reduction_regions.mlir
deleted file mode 100644
index 8dbb335..0000000
--- a/iree/compiler/Dialect/Flow/Transforms/test/identify_reduction_regions.mlir
+++ /dev/null
@@ -1,55 +0,0 @@
-// RUN: iree-opt -split-input-file -iree-flow-identify-reduction-regions %s | IreeFileCheck %s
-
-// CHECK-LABEL: @single_reduction
-func @single_reduction(%arg0 : tensor<4x8xf32>) -> tensor<4xf32> {
-  // CHECK-DAG: [[INITIAL:%.+]] = constant dense<0.000000e+00>
-  %0 = constant dense<0.000000e+00> : tensor<f32>
-  // CHECK-DAG: constant dense<[4, 1, 1]>
-  // CHECK-NEXT: [[RESULT:%.+]] = flow.reduction.region
-  // CHECK-SAME: [%cst_0 : vector<3xi32>]
-  // CHECK-SAME: (%arg1 = %arg0 : tensor<4x8xf32>, %arg2 = [[INITIAL]] : tensor<f32>) -> tensor<4xf32>
-  // CHECK-NEXT: = "xla_hlo.reduce"(%arg1, %arg2)
-  %1 = "xla_hlo.reduce"(%arg0, %0) ( {
-  // CHECK: invocation((%arg1, %arg2) : tensor<f32>) -> tensor<f32> {
-  ^bb0(%arg1 : tensor<f32>, %arg2 : tensor<f32>):
-    // CHECK-NEXT: %1 = xla_hlo.add %arg1, %arg2 : tensor<f32>
-    %2 = xla_hlo.add %arg1, %arg2 : tensor<f32>
-    // CHECK-NEXT: flow.return %1 : tensor<f32>
-    "xla_hlo.return"(%2) : (tensor<f32>) -> ()
-  // CHECK-NEXT: } {dimensions = dense<1> : vector<1xi32>}
-  }) {dimensions = dense<[1]> : tensor<1xi64>} : (tensor<4x8xf32>, tensor<f32>) -> tensor<4xf32>
-  // CHECK: return [[RESULT]] : tensor<4xf32>
-  return %1 : tensor<4xf32>
-}
-
-// -----
-
-// CHECK-LABEL: @multi_reduction
-func @multi_reduction(%arg0 : tensor<4x8xf32>, %arg1 : tensor<4x8xf32>) -> (tensor<4xf32>, tensor<4xf32>) {
-  // CHECK-DAG: [[INITIALA:%.+]] = constant dense<0.000000e+00>
-  %0 = constant dense<0.000000e+00> : tensor<f32>
-  // CHECK-DAG: [[INITIALB:%.+]] = constant dense<1.000000e+00>
-  %1 = constant dense<1.000000e+00> : tensor<f32>
-  // CHECK: constant dense<[4, 1, 1]>
-  // CHECK-NEXT: [[RESULT:%.+]]:2 = flow.reduction.region
-  // CHECK-SAME: [%cst_1 : vector<3xi32>]
-  // CHECK-SAME: (%arg2 = %arg0 : tensor<4x8xf32>, %arg3 = %arg1 : tensor<4x8xf32>, %arg4 = [[INITIALA]] : tensor<f32>, %arg5 = [[INITIALB]] : tensor<f32>) -> (tensor<4xf32>, tensor<4xf32>)
-  // CHECK-NEXT: = "xla_hlo.reduce"(%arg2, %arg3, %arg4, %arg5)
-  %2, %3 = "xla_hlo.reduce"(%arg0, %arg1, %0, %1) ( {
-  // CHECK: invocation((%arg2, %arg3) : tensor<f32>, (%arg4, %arg5) : tensor<f32>) -> (tensor<f32>, tensor<f32>) {
-  ^bb0(%arg0_lhs : tensor<f32>, %arg1_lhs : tensor<f32>, %arg0_rhs : tensor<f32>, %arg1_rhs : tensor<f32>):
-    // CHECK-NEXT: %1 = xla_hlo.add %arg2, %arg4 : tensor<f32>
-    %4 = xla_hlo.add %arg0_lhs, %arg0_rhs : tensor<f32>
-    // CHECK-NEXT: %2 = xla_hlo.add %arg3, %arg5 : tensor<f32>
-    %5 = xla_hlo.add %arg1_lhs, %arg1_rhs : tensor<f32>
-    // CHECK-NEXT: flow.return %1, %2 : tensor<f32>, tensor<f32>
-    "xla_hlo.return"(%4, %5) : (tensor<f32>, tensor<f32>) -> ()
-  // CHECK-NEXT: } {dimensions = dense<1> : vector<1xi32>}
-  }) {dimensions = dense<[1]> : tensor<1xi64>} : (tensor<4x8xf32>, tensor<4x8xf32>, tensor<f32>, tensor<f32>) -> (tensor<4xf32>, tensor<4xf32>)
-  // CHECK: return [[RESULT]]#0, [[RESULT]]#1 : tensor<4xf32>, tensor<4xf32>
-  return %2, %3 : tensor<4xf32>, tensor<4xf32>
-}
-
-// -----
-
-// TODO(benvanik): windowed reduction.
diff --git a/iree/compiler/Dialect/Flow/Transforms/test/outline_reduction_regions.mlir b/iree/compiler/Dialect/Flow/Transforms/test/outline_reduction_regions.mlir
deleted file mode 100644
index f05e169..0000000
--- a/iree/compiler/Dialect/Flow/Transforms/test/outline_reduction_regions.mlir
+++ /dev/null
@@ -1,123 +0,0 @@
-// RUN: iree-opt -split-input-file -iree-flow-outline-reduction-regions -cse %s | IreeFileCheck %s
-
-func @single_reduction(%arg0: tensor<4x8xf32>) -> tensor<4xf32> {
-  %cst = constant dense<0.000000e+00> : tensor<f32>
-  %cst_0 = constant dense<[4, 1, 1]> : vector<3xi32>
-  %0 = flow.reduction.region[%cst_0 : vector<3xi32>](%arg1 = %arg0 : tensor<4x8xf32>, %arg2 = %cst : tensor<f32>) -> tensor<4xf32> {
-    %1 = "xla_hlo.reduce"(%arg1, %arg2) ( {
-    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>): // no predecessors
-      %2 = xla_hlo.add %arg3, %arg4 : tensor<f32>
-      "xla_hlo.return"(%2) : (tensor<f32>) -> ()
-    }) {dimensions = dense<1> : tensor<1xi64>} : (tensor<4x8xf32>, tensor<f32>) -> tensor<4xf32>
-    flow.return %1 : tensor<4xf32>
-  } invocation((%arg1, %arg2) : tensor<f32>) -> tensor<f32> {
-    %1 = xla_hlo.add %arg1, %arg2 : tensor<f32>
-    flow.return %1 : tensor<f32>
-  } {dimensions = dense<1> : vector<1xi32>}
-  return %0 : tensor<4xf32>
-}
-
-// CHECK-LABEL: flow.executable @single_reduction_reduce_0_dim_0 {
-//  CHECK-NEXT:   flow.reduction.entry @single_reduction_reduce_0_dim_0_dispatch apply(@single_reduction_reduce_0_dim_0_invocation) attributes {dimension = 1 : i32}
-//  CHECK-NEXT:   module {
-//  CHECK-NEXT:     func @single_reduction_reduce_0_dim_0_dispatch(tensor<4x8xf32>, tensor<f32>) -> tensor<4xf32>
-//  CHECK-NEXT:     func @single_reduction_reduce_0_dim_0_invocation(%arg0: tensor<f32>, %arg1: tensor<f32>) -> tensor<f32> {
-//  CHECK-NEXT:       %0 = xla_hlo.add %arg0, %arg1 : tensor<f32>
-//  CHECK-NEXT:       return %0 : tensor<f32>
-//  CHECK-NEXT:     }
-//  CHECK-NEXT:   }
-//  CHECK-NEXT: }
-//  CHECK-NEXT: func @single_reduction(%arg0: tensor<4x8xf32>) -> tensor<4xf32> {
-//   CHECK-DAG:   %cst = constant dense<0.000000e+00> : tensor<f32>
-//   CHECK-DAG:   %cst_0 = constant dense<[4, 1, 1]> : vector<3xi32>
-//  CHECK-NEXT:   %0 = flow.dispatch @single_reduction_reduce_0_dim_0::@single_reduction_reduce_0_dim_0_dispatch[%cst_0 : vector<3xi32>](%arg0, %cst) : (tensor<4x8xf32>, tensor<f32>) -> tensor<4xf32>
-//  CHECK-NEXT:   return %0 : tensor<4xf32>
-//  CHECK-NEXT: }
-
-// -----
-
-func @unrolled_reduction(%arg0: tensor<4x2x8xf32>) -> tensor<4xf32> {
-  %cst = constant dense<0.000000e+00> : tensor<f32>
-  %cst_0 = constant dense<[4, 1, 1]> : vector<3xi32>
-  %0 = flow.reduction.region[%cst_0 : vector<3xi32>](%arg1 = %arg0 : tensor<4x2x8xf32>, %arg2 = %cst : tensor<f32>) -> tensor<4xf32> {
-    %1 = "xla_hlo.reduce"(%arg1, %arg2) ( {
-    ^bb0(%arg3: tensor<f32>, %arg4: tensor<f32>): // no predecessors
-      %2 = xla_hlo.add %arg3, %arg4 : tensor<f32>
-      "xla_hlo.return"(%2) : (tensor<f32>) -> ()
-    }) {dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<4x2x8xf32>, tensor<f32>) -> tensor<4xf32>
-    flow.return %1 : tensor<4xf32>
-  } invocation((%arg1, %arg2) : tensor<f32>) -> tensor<f32> {
-    %1 = xla_hlo.add %arg1, %arg2 : tensor<f32>
-    flow.return %1 : tensor<f32>
-  } {dimensions = dense<[1, 2]> : vector<2xi32>}
-  return %0 : tensor<4xf32>
-}
-
-// CHECK-LABEL: flow.executable @unrolled_reduction_reduce_0_dim_0 {
-//  CHECK-NEXT:   flow.reduction.entry @unrolled_reduction_reduce_0_dim_0_dispatch apply(@unrolled_reduction_reduce_0_dim_0_invocation) attributes {dimension = 2 : i32}
-//  CHECK-NEXT:   module {
-//  CHECK-NEXT:     func @unrolled_reduction_reduce_0_dim_0_dispatch(tensor<4x2x8xf32>, tensor<f32>) -> tensor<4x2xf32>
-//  CHECK-NEXT:     func @unrolled_reduction_reduce_0_dim_0_invocation(%arg0: tensor<f32>, %arg1: tensor<f32>) -> tensor<f32> {
-//  CHECK-NEXT:       %0 = xla_hlo.add %arg0, %arg1 : tensor<f32>
-//  CHECK-NEXT:       return %0 : tensor<f32>
-//  CHECK-NEXT:     }
-//  CHECK-NEXT:   }
-//  CHECK-NEXT: }
-//  CHECK-NEXT: flow.executable @unrolled_reduction_reduce_0_dim_1 {
-//  CHECK-NEXT:   flow.reduction.entry @unrolled_reduction_reduce_0_dim_1_dispatch apply(@unrolled_reduction_reduce_0_dim_1_invocation) attributes {dimension = 1 : i32}
-//  CHECK-NEXT:   module {
-//  CHECK-NEXT:     func @unrolled_reduction_reduce_0_dim_1_dispatch(tensor<4x2xf32>, tensor<f32>) -> tensor<4xf32>
-//  CHECK-NEXT:     func @unrolled_reduction_reduce_0_dim_1_invocation(%arg0: tensor<f32>, %arg1: tensor<f32>) -> tensor<f32> {
-//  CHECK-NEXT:       %0 = xla_hlo.add %arg0, %arg1 : tensor<f32>
-//  CHECK-NEXT:       return %0 : tensor<f32>
-//  CHECK-NEXT:     }
-//  CHECK-NEXT:   }
-//  CHECK-NEXT: }
-//  CHECK-NEXT: func @unrolled_reduction(%arg0: tensor<4x2x8xf32>) -> tensor<4xf32> {
-//   CHECK-DAG:   %cst = constant dense<0.000000e+00> : tensor<f32>
-//   CHECK-DAG:   %cst_0 = constant dense<[4, 1, 1]> : vector<3xi32>
-//  CHECK-NEXT:   %0 = flow.dispatch @unrolled_reduction_reduce_0_dim_0::@unrolled_reduction_reduce_0_dim_0_dispatch[%cst_0 : vector<3xi32>](%arg0, %cst) : (tensor<4x2x8xf32>, tensor<f32>) -> tensor<4x2xf32>
-//  CHECK-NEXT:   %1 = flow.dispatch @unrolled_reduction_reduce_0_dim_1::@unrolled_reduction_reduce_0_dim_1_dispatch[%cst_0 : vector<3xi32>](%0, %cst) : (tensor<4x2xf32>, tensor<f32>) -> tensor<4xf32>
-//  CHECK-NEXT:   return %1 : tensor<4xf32>
-//  CHECK-NEXT: }
-
-// -----
-
-func @multi_reduction(%arg0: tensor<4x8xf32>, %arg1: tensor<4x8xf32>) -> (tensor<4xf32>, tensor<4xf32>) {
-  %cst = constant dense<0.000000e+00> : tensor<f32>
-  %cst_0 = constant dense<1.000000e+00> : tensor<f32>
-  %cst_1 = constant dense<[4, 1, 1]> : vector<3xi32>
-  %0:2 = flow.reduction.region[%cst_1 : vector<3xi32>](%arg2 = %arg0 : tensor<4x8xf32>, %arg3 = %arg1 : tensor<4x8xf32>, %arg4 = %cst : tensor<f32>, %arg5 = %cst_0 : tensor<f32>) -> (tensor<4xf32>, tensor<4xf32>) {
-    %1:2 = "xla_hlo.reduce"(%arg2, %arg3, %arg4, %arg5) ( {
-    ^bb0(%arg6: tensor<f32>, %arg7: tensor<f32>, %arg8: tensor<f32>, %arg9: tensor<f32>): // no predecessors
-      %2 = xla_hlo.add %arg6, %arg8 : tensor<f32>
-      %3 = xla_hlo.add %arg7, %arg9 : tensor<f32>
-      "xla_hlo.return"(%2, %3) : (tensor<f32>, tensor<f32>) -> ()
-    }) {dimensions = dense<1> : tensor<1xi64>} : (tensor<4x8xf32>, tensor<4x8xf32>, tensor<f32>, tensor<f32>) -> (tensor<4xf32>, tensor<4xf32>)
-    flow.return %1#0, %1#1 : tensor<4xf32>, tensor<4xf32>
-  } invocation((%arg2, %arg3) : tensor<f32>, (%arg4, %arg5) : tensor<f32>) -> (tensor<f32>, tensor<f32>) {
-    %1 = xla_hlo.add %arg2, %arg4 : tensor<f32>
-    %2 = xla_hlo.add %arg3, %arg5 : tensor<f32>
-    flow.return %1, %2 : tensor<f32>, tensor<f32>
-  } {dimensions = dense<1> : vector<1xi32>}
-  return %0#0, %0#1 : tensor<4xf32>, tensor<4xf32>
-}
-
-// CHECK-LABEL: flow.executable @multi_reduction_reduce_0_dim_0 {
-//  CHECK-NEXT:   flow.reduction.entry @multi_reduction_reduce_0_dim_0_dispatch apply(@multi_reduction_reduce_0_dim_0_invocation) attributes {dimension = 1 : i32}
-//  CHECK-NEXT:   module {
-//  CHECK-NEXT:     func @multi_reduction_reduce_0_dim_0_dispatch(tensor<4x8xf32>, tensor<4x8xf32>, tensor<f32>, tensor<f32>) -> (tensor<4xf32>, tensor<4xf32>)
-//  CHECK-NEXT:     func @multi_reduction_reduce_0_dim_0_invocation(%arg0: tensor<f32>, %arg1: tensor<f32>, %arg2: tensor<f32>, %arg3: tensor<f32>) -> (tensor<f32>, tensor<f32>) {
-//  CHECK-NEXT:       %0 = xla_hlo.add %arg0, %arg2 : tensor<f32>
-//  CHECK-NEXT:       %1 = xla_hlo.add %arg1, %arg3 : tensor<f32>
-//  CHECK-NEXT:       return %0, %1 : tensor<f32>, tensor<f32>
-//  CHECK-NEXT:     }
-//  CHECK-NEXT:   }
-//  CHECK-NEXT: }
-//  CHECK-NEXT: func @multi_reduction(%arg0: tensor<4x8xf32>, %arg1: tensor<4x8xf32>) -> (tensor<4xf32>, tensor<4xf32>) {
-//   CHECK-DAG:   %cst = constant dense<0.000000e+00> : tensor<f32>
-//   CHECK-DAG:   %cst_0 = constant dense<1.000000e+00> : tensor<f32>
-//   CHECK-DAG:   %cst_1 = constant dense<[4, 1, 1]> : vector<3xi32>
-//  CHECK-NEXT:   %0:2 = flow.dispatch @multi_reduction_reduce_0_dim_0::@multi_reduction_reduce_0_dim_0_dispatch[%cst_1 : vector<3xi32>](%arg0, %arg1, %cst, %cst_0) : (tensor<4x8xf32>, tensor<4x8xf32>, tensor<f32>, tensor<f32>) -> (tensor<4xf32>, tensor<4xf32>)
-//  CHECK-NEXT:   return %0#0, %0#1 : tensor<4xf32>, tensor<4xf32>
-//  CHECK-NEXT: }
diff --git a/iree/compiler/Dialect/Flow/Transforms/test/transformation.mlir b/iree/compiler/Dialect/Flow/Transforms/test/transformation.mlir
index 5e8eb70..4b1dee9 100644
--- a/iree/compiler/Dialect/Flow/Transforms/test/transformation.mlir
+++ b/iree/compiler/Dialect/Flow/Transforms/test/transformation.mlir
@@ -160,28 +160,28 @@
   return %1 : tensor<4xf32>
 }
 
-// CHECK-LABEL: flow.executable @reduction_reduce_0_dim_0 {
-// CHECK-NEXT:   flow.reduction.entry @reduction_reduce_0_dim_0_dispatch apply(@reduction_reduce_0_dim_0_invocation) attributes {
-// CHECK-SAME:     dimension = 1 : i32,
-// CHECK-SAME:     workload = dense<[4, 1, 1]>
-// CHECK-SAME:   }
-// CHECK-NEXT:   module {
-// CHECK-NEXT:     func @reduction_reduce_0_dim_0_dispatch(tensor<4x8xf32>, tensor<f32>) -> tensor<4xf32>
-// CHECK-NEXT:     func @reduction_reduce_0_dim_0_invocation(%arg0: tensor<f32>, %arg1: tensor<f32>) -> tensor<f32> {
-// CHECK-NEXT:       %0 = xla_hlo.add %arg0, %arg1 : tensor<f32>
-// CHECK-NEXT:       return %0 : tensor<f32>
-// CHECK-NEXT:     }
-// CHECK-NEXT:   }
-// CHECK-NEXT: }
-// CHECK-NEXT: func @reduction(%arg0: tensor<4x8xf32>) -> tensor<4xf32> {
-// CHECK-NEXT:   %cst = constant dense<0.000000e+00> : tensor<f32>
-// CHECK-NEXT:   %cst_0 = constant dense<[4, 1, 1]> : vector<3xi32>
-// CHECK-NEXT:   %0 = flow.ex.stream.fragment(%arg1 = %cst_0 : vector<3xi32>, %arg2 = %arg0 : tensor<4x8xf32>, %arg3 = %cst : tensor<f32>) -> tensor<4xf32> {
-// CHECK-NEXT:     %1 = flow.dispatch @reduction_reduce_0_dim_0::@reduction_reduce_0_dim_0_dispatch[%arg1 : vector<3xi32>](%arg2, %arg3) : (tensor<4x8xf32>, tensor<f32>) -> tensor<4xf32>
-// CHECK-NEXT:     flow.return %1 : tensor<4xf32>
-// CHECK-NEXT:   }
-// CHECK-NEXT:   return %0 : tensor<4xf32>
-// CHECK-NEXT: }
+// CHECK-LABEL: flow.executable @reduction_ex_dispatch_0 {
+//  CHECK-NEXT:   flow.dispatch.entry @reduction_ex_dispatch_0 attributes {workload = dense<[4, 1, 1]> : vector<3xi32>}
+//  CHECK-NEXT:   module {
+//  CHECK-NEXT:     func @reduction_ex_dispatch_0(%arg0: tensor<4x8xf32>) -> tensor<4xf32> {
+//  CHECK-NEXT:       %cst = constant dense<0.000000e+00> : tensor<f32>
+//  CHECK-NEXT:       %0 = "xla_hlo.reduce"(%arg0, %cst) ( {
+//  CHECK-NEXT:       ^bb0(%arg1: tensor<f32>, %arg2: tensor<f32>): // no predecessors
+//  CHECK-NEXT:         %1 = xla_hlo.add %arg1, %arg2 : tensor<f32>
+//  CHECK-NEXT:         "xla_hlo.return"(%1) : (tensor<f32>) -> ()
+//  CHECK-NEXT:       }) {dimensions = dense<1> : tensor<1xi64>} : (tensor<4x8xf32>, tensor<f32>) -> tensor<4xf32>
+//  CHECK-NEXT:       return %0 : tensor<4xf32>
+//  CHECK-NEXT:     }
+//  CHECK-NEXT:   }
+//  CHECK-NEXT: }
+//  CHECK-NEXT: func @reduction(%arg0: tensor<4x8xf32>) -> tensor<4xf32> {
+//  CHECK-NEXT:   %cst = constant dense<[4, 1, 1]> : vector<3xi32>
+//  CHECK-NEXT:   %0 = flow.ex.stream.fragment(%arg1 = %cst : vector<3xi32>, %arg2 = %arg0 : tensor<4x8xf32>) -> tensor<4xf32> {
+//  CHECK-NEXT:     %1 = flow.dispatch @reduction_ex_dispatch_0::@reduction_ex_dispatch_0[%arg1 : vector<3xi32>](%arg2) : (tensor<4x8xf32>) -> tensor<4xf32>
+//  CHECK-NEXT:     flow.return %1 : tensor<4xf32>
+//  CHECK-NEXT:   }
+//  CHECK-NEXT:   return %0 : tensor<4xf32>
+//  CHECK-NEXT: }
 
 // -----
 
diff --git a/iree/compiler/Dialect/HAL/Target/LLVM/LLVMTarget.cpp b/iree/compiler/Dialect/HAL/Target/LLVM/LLVMTarget.cpp
index 4ed3f3b..7a2a93f 100644
--- a/iree/compiler/Dialect/HAL/Target/LLVM/LLVMTarget.cpp
+++ b/iree/compiler/Dialect/HAL/Target/LLVM/LLVMTarget.cpp
@@ -49,8 +49,6 @@
   for (auto& op : executableOp.getBlock().getOperations()) {
     if (auto entryOp = dyn_cast<IREE::Flow::DispatchEntryOp>(op)) {
       entryPointNames.push_back(std::string(entryOp.function_ref()));
-    } else if (auto entryOp = dyn_cast<IREE::Flow::ReductionEntryOp>(op)) {
-      entryPointNames.push_back(std::string(entryOp.function_ref()));
     }
   }
   return entryPointNames;
diff --git a/iree/compiler/Dialect/HAL/Target/VulkanSPIRV/VulkanSPIRVTarget.cpp b/iree/compiler/Dialect/HAL/Target/VulkanSPIRV/VulkanSPIRVTarget.cpp
index e726c9e..9e5af90 100644
--- a/iree/compiler/Dialect/HAL/Target/VulkanSPIRV/VulkanSPIRVTarget.cpp
+++ b/iree/compiler/Dialect/HAL/Target/VulkanSPIRV/VulkanSPIRVTarget.cpp
@@ -102,8 +102,6 @@
   for (auto &op : executableOp.getBlock().getOperations()) {
     if (auto entryOp = dyn_cast<IREE::Flow::DispatchEntryOp>(op)) {
       entryPointNames.push_back(std::string(entryOp.function_ref()));
-    } else if (auto entryOp = dyn_cast<IREE::Flow::ReductionEntryOp>(op)) {
-      entryPointNames.push_back(std::string(entryOp.function_ref()));
     }
   }
   return entryPointNames;
@@ -214,14 +212,6 @@
       auto workGroupSize = funcOp.getAttrOfType<DenseIntElementsAttr>(
           "iree.executable.workgroup_size");
       targetEntryOp.setAttr("workgroup_size", workGroupSize);
-    } else if (auto entryOp = dyn_cast<IREE::Flow::ReductionEntryOp>(&op)) {
-      auto targetEntryOp =
-          executableOp.lookupSymbol<IREE::HAL::ExecutableEntryPointOp>(
-              entryOp.sym_name());
-      auto funcOp = moduleOp.lookupSymbol<FuncOp>(entryOp.function_ref());
-      auto workGroupSize = funcOp.getAttrOfType<DenseIntElementsAttr>(
-          "iree.executable.workgroup_size");
-      targetEntryOp.setAttr("workgroup_size", workGroupSize);
     }
   }
 }
diff --git a/iree/compiler/Dialect/HAL/Target/test/smoketest.mlir b/iree/compiler/Dialect/HAL/Target/test/smoketest.mlir
index c647802..d9b2169 100644
--- a/iree/compiler/Dialect/HAL/Target/test/smoketest.mlir
+++ b/iree/compiler/Dialect/HAL/Target/test/smoketest.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt -split-input-file -iree-hal-transformation-pipeline -iree-hal-target-backends=vulkan-spirv -iree-flow-experimental-dispatch-reduce %s | IreeFileCheck %s -check-prefix=VKSPV
+// RUN: iree-opt -split-input-file -iree-hal-transformation-pipeline -iree-hal-target-backends=vulkan-spirv %s | IreeFileCheck %s -check-prefix=VKSPV
 
 flow.executable @simpleMath_ex_dispatch_0 {
   flow.dispatch.entry @simpleMath_rgn_dispatch_0 attributes {
diff --git a/iree/compiler/Dialect/HAL/Transforms/MaterializeInterfaces.cpp b/iree/compiler/Dialect/HAL/Transforms/MaterializeInterfaces.cpp
index c3fcb2c..b837b4d 100644
--- a/iree/compiler/Dialect/HAL/Transforms/MaterializeInterfaces.cpp
+++ b/iree/compiler/Dialect/HAL/Transforms/MaterializeInterfaces.cpp
@@ -51,12 +51,6 @@
           moduleOp.lookupSymbol<FuncOp>(dispatchEntryOp.function_ref());
       entryFuncOps.push_back(funcOp);
       entryLocs.push_back(dispatchEntryOp.getLoc());
-    } else if (auto reductionEntryOp =
-                   dyn_cast<IREE::Flow::ReductionEntryOp>(op)) {
-      auto funcOp =
-          moduleOp.lookupSymbol<FuncOp>(reductionEntryOp.function_ref());
-      entryFuncOps.push_back(funcOp);
-      entryLocs.push_back(reductionEntryOp.getLoc());
     }
   }
   auto interfaceLoc = executableBuilder.getFusedLoc(entryLocs);
@@ -208,26 +202,6 @@
           builder.getI32IntegerAttr(nextOrdinal++), workGroupSizeAttr,
           builder.getSymbolRefAttr(interfaceOp),
           TypeAttr::get(sourceFuncOp.getType()));
-    } else if (auto reductionEntryOp =
-                   dyn_cast<IREE::Flow::ReductionEntryOp>(op)) {
-      auto sourceFuncOp = sourceOp.getInnerModule().lookupSymbol<FuncOp>(
-          reductionEntryOp.function_ref());
-      auto thunkFuncOp = createDispatchEntryThunk(sourceFuncOp, interfaceOp);
-      if (!thunkFuncOp.hasValue()) {
-        return failure();
-      }
-      reductionEntryOp.setAttr(
-          "function_ref", builder.getSymbolRefAttr(thunkFuncOp.getValue()));
-
-      builder.create<IREE::HAL::ExecutableEntryPointOp>(
-          reductionEntryOp.getLoc(),
-          builder.getStringAttr(thunkFuncOp->getName()),
-          builder.getI32IntegerAttr(nextOrdinal++),
-          DenseIntElementsAttr::get(
-              VectorType::get({3}, builder.getIntegerType(32)),
-              ArrayRef<int32_t>{1, 1, 1}),
-          builder.getSymbolRefAttr(interfaceOp),
-          TypeAttr::get(sourceFuncOp.getType()));
     }
   }
   return success();
diff --git a/iree/compiler/Dialect/HAL/Transforms/RewriteLegacyIO.cpp b/iree/compiler/Dialect/HAL/Transforms/RewriteLegacyIO.cpp
index aaf942a..e54525c 100644
--- a/iree/compiler/Dialect/HAL/Transforms/RewriteLegacyIO.cpp
+++ b/iree/compiler/Dialect/HAL/Transforms/RewriteLegacyIO.cpp
@@ -92,46 +92,6 @@
   for (auto *deadOp : deadOps) deadOp->erase();
 }
 
-// TODO(b/150312935): remove this when the SPIR-V and LLVM targets use
-// hal.interface.
-static void makeLegacyExecutableReductionABI(
-    IREE::Flow::ReductionEntryOp reductionEntryOp, FuncOp thunkOp) {
-  auto *context = thunkOp.getContext();
-
-  auto implOp = thunkOp.getParentOfType<ModuleOp>().lookupSymbol<FuncOp>(
-      (thunkOp.getName() + "_impl").str());
-  implOp.setAttr("iree.executable.export", UnitAttr::get(context));
-  implOp.setAttr("iree.executable.reduction", UnitAttr::get(context));
-  implOp.setAttr("iree.executable.reduction.apply",
-                 FlatSymbolRefAttr::get(reductionEntryOp.apply_ref(), context));
-  implOp.setAttr("iree.executable.reduction.dimension",
-                 IntegerAttr::get(IntegerType::get(32, context),
-                                  reductionEntryOp.dimension()));
-
-  // Remove any blocks that may exist within the implementation function as the
-  // backend will be replacing the body with its own implementation.
-  implOp.getBlocks().clear();
-
-  // Destroy the IO op and replace with the original entry.
-  SymbolTable::setSymbolVisibility(implOp, SymbolTable::Visibility::Public);
-  auto originalName = thunkOp.getName();
-  thunkOp.erase();
-  implOp.setName(originalName);
-
-  // Reset function type to memrefs with output args.
-  SmallVector<Type, 4> inputTypes;
-  for (const auto &oldType : implOp.getType().getInputs()) {
-    inputTypes.push_back(
-        convertLegacyTypeToMemRef(legalizeLegacyType(oldType)));
-  }
-  for (const auto &oldType : implOp.getType().getResults()) {
-    inputTypes.push_back(
-        convertLegacyTypeToMemRef(legalizeLegacyType(oldType)));
-  }
-  auto funcType = FunctionType::get(inputTypes, {}, context);
-  implOp.setType(funcType);
-}
-
 class RewriteLegacyIOPass
     : public OperationPass<RewriteLegacyIOPass, IREE::Flow::ExecutableOp> {
  public:
@@ -144,9 +104,6 @@
       if (auto entryOp = dyn_cast<IREE::Flow::DispatchEntryOp>(&op)) {
         auto thunkOp = moduleOp.lookupSymbol<FuncOp>(entryOp.function_ref());
         makeLegacyExecutableDispatchABI(entryOp, thunkOp);
-      } else if (auto entryOp = dyn_cast<IREE::Flow::ReductionEntryOp>(&op)) {
-        auto thunkOp = moduleOp.lookupSymbol<FuncOp>(entryOp.function_ref());
-        makeLegacyExecutableReductionABI(entryOp, thunkOp);
       }
     }
 
diff --git a/iree/compiler/Dialect/HAL/Transforms/test/materialize_interfaces.mlir b/iree/compiler/Dialect/HAL/Transforms/test/materialize_interfaces.mlir
index aab38bb..8fa24fd 100644
--- a/iree/compiler/Dialect/HAL/Transforms/test/materialize_interfaces.mlir
+++ b/iree/compiler/Dialect/HAL/Transforms/test/materialize_interfaces.mlir
@@ -35,46 +35,3 @@
     // CHECK: hal.interface @legacy_io attributes {sym_visibility = "private"}
   }
 }
-
-// -----
-
-// CHECK-LABEL: hal.executable @reduction_ex_reduce_0_dim_0
-// CHECK-DAG: hal.interface @legacy_io {
-// CHECK-NEXT:  hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-// CHECK-NEXT:  hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
-// CHECK-NEXT:  hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-// CHECK-NEXT: }
-// CHECK-DAG: hal.executable.entry_point @reduction_rgn_reduce_0_dim_0_entry attributes {
-// CHECK-SAME:  interface = @legacy_io,
-// CHECK-SAME:  ordinal = 0 : i32,
-// CHECK-SAME:  signature = (tensor<4x8xf32>, tensor<f32>) -> tensor<4xf32>,
-// CHECK-SAME:  workgroup_size = dense<1> : vector<3xi32>
-// CHECK-SAME: }
-// CHECK-DAG: hal.executable.source {
-// CHECK-NEXT: module {
-// CHECK-NEXT: flow.executable @reduction_ex_reduce_0_dim_0
-flow.executable @reduction_ex_reduce_0_dim_0 {
-  flow.reduction.entry @reduction_rgn_reduce_0_dim_0_entry apply(@reduction_rgn_reduce_0_dim_0) attributes {
-    dimension = 1 : i32,
-    workgroup_size = dense<[32, 1, 1]> : vector<3xi32>,
-    workload = dense<[4, 1, 1]> : vector<3xi32>
-  }
-  // CHECK: module {
-  module {
-    // CHECK-NEXT: func @reduction_rgn_reduce_0_dim_0_entry() {
-    // CHECK-NEXT:   [[ZERO:%.+]] = constant 0
-    // CHECK-NEXT:   [[ARG0:%.+]] = hal.interface.load.tensor @legacy_io::@arg0, offset = [[ZERO]] : tensor<4x8xf32>
-    // CHECK-NEXT:   [[ARG1:%.+]] = hal.interface.load.tensor @legacy_io::@arg1, offset = [[ZERO]] : tensor<f32>
-    // CHECK-NEXT:   [[RET0:%.+]] = call @reduction_rgn_reduce_0_dim_0_entry_impl([[ARG0]], [[ARG1]]) : (tensor<4x8xf32>, tensor<f32>) -> tensor<4xf32>
-    // CHECK-NEXT:   hal.interface.store.tensor [[RET0]], @legacy_io::@ret0, offset = [[ZERO]] : tensor<4xf32>
-    // CHECK-NEXT:   return
-    // CHECK-NEXT: }
-    // CHECK-NEXT: func @reduction_rgn_reduce_0_dim_0_entry_impl
-    func @reduction_rgn_reduce_0_dim_0_entry(tensor<4x8xf32>, tensor<f32>) -> tensor<4xf32>
-    func @reduction_rgn_reduce_0_dim_0(%arg0: tensor<f32>, %arg1: tensor<f32>) -> tensor<f32> {
-      %0 = xla_hlo.add %arg0, %arg1 : tensor<f32>
-      return %0 : tensor<f32>
-    }
-    // CHECK: hal.interface @legacy_io attributes {sym_visibility = "private"}
-  }
-}
diff --git a/iree/compiler/Dialect/HAL/Transforms/test/rewrite_legacy_io.mlir b/iree/compiler/Dialect/HAL/Transforms/test/rewrite_legacy_io.mlir
index 699ca95..a8afb0a 100644
--- a/iree/compiler/Dialect/HAL/Transforms/test/rewrite_legacy_io.mlir
+++ b/iree/compiler/Dialect/HAL/Transforms/test/rewrite_legacy_io.mlir
@@ -32,40 +32,3 @@
 // CHECK-NEXT:     }
 // CHECK-NEXT:   }
 // CHECK-NEXT: }
-
-// -----
-
-flow.executable @reduction_ex_reduce_0_dim_0 {
-  flow.reduction.entry @reduction_rgn_reduce_0_dim_0_entry apply(@reduction_rgn_reduce_0_dim_0) attributes {dimension = 1 : i32, workgroup_size = dense<[32, 1, 1]> : vector<3xi32>, workload = dense<[4, 1, 1]> : vector<3xi32>}
-  module {
-    func @reduction_rgn_reduce_0_dim_0_entry() {
-      %c0_i32 = constant 0 : i32
-      %0 = hal.interface.load.tensor @legacy_io::@arg0, offset = %c0_i32 : tensor<4x8xf32>
-      %1 = hal.interface.load.tensor @legacy_io::@arg1, offset = %c0_i32 : tensor<f32>
-      %2 = call @reduction_rgn_reduce_0_dim_0_entry_impl(%0, %1) : (tensor<4x8xf32>, tensor<f32>) -> tensor<4xf32>
-      hal.interface.store.tensor %2, @legacy_io::@ret0, offset = %c0_i32 : tensor<4xf32>
-      return
-    }
-    func @reduction_rgn_reduce_0_dim_0_entry_impl(tensor<4x8xf32>, tensor<f32>) -> tensor<4xf32> attributes {sym_visibility = "private"}
-    func @reduction_rgn_reduce_0_dim_0(%arg0: tensor<f32>, %arg1: tensor<f32>) -> tensor<f32> {
-      %0 = xla_hlo.add %arg0, %arg1 : tensor<f32>
-      return %0 : tensor<f32>
-    }
-    hal.interface @legacy_io attributes {sym_visibility = "private"} {
-      hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
-      hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
-      hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
-    }
-  }
-}
-
-// CHECK-LABEL: flow.executable @reduction_ex_reduce_0_dim_0 {
-// CHECK-NEXT:   flow.reduction.entry @reduction_rgn_reduce_0_dim_0_entry apply(@reduction_rgn_reduce_0_dim_0) attributes {dimension = 1 : i32, workgroup_size = dense<[32, 1, 1]> : vector<3xi32>, workload = dense<[4, 1, 1]> : vector<3xi32>}
-// CHECK-NEXT:   module {
-// CHECK-NEXT:     func @reduction_rgn_reduce_0_dim_0_entry(memref<4x8xf32>, memref<f32>, memref<4xf32>) attributes {iree.executable.export, iree.executable.reduction, iree.executable.reduction.apply = @reduction_rgn_reduce_0_dim_0, iree.executable.reduction.dimension = 1 : i32}
-// CHECK-NEXT:     func @reduction_rgn_reduce_0_dim_0(%arg0: tensor<f32>, %arg1: tensor<f32>) -> tensor<f32> {
-// CHECK-NEXT:       %0 = xla_hlo.add %arg0, %arg1 : tensor<f32>
-// CHECK-NEXT:       return %0 : tensor<f32>
-// CHECK-NEXT:     }
-// CHECK-NEXT:   }
-// CHECK-NEXT: }
diff --git a/iree/hal/cts/BUILD b/iree/hal/cts/BUILD
index b5dc016..0b6ce91 100644
--- a/iree/hal/cts/BUILD
+++ b/iree/hal/cts/BUILD
@@ -35,6 +35,7 @@
         "//iree/hal:driver_registry",
 
         # HAL driver modules.
+        "//iree/hal/vmla:vmla_driver_module",  # build-cleaner: keep
         "//iree/hal/vulkan:vulkan_driver_module",  # build-cleaner: keep
         # "//iree/hal/dawn:dawn_driver_module",  # build-cleaner: keep
     ] + PLATFORM_VULKAN_TEST_DEPS,
diff --git a/test/e2e/linalg_path/reduce.mlir b/test/e2e/linalg_path/reduce.mlir
index 76e743b..21b9a80 100644
--- a/test/e2e/linalg_path/reduce.mlir
+++ b/test/e2e/linalg_path/reduce.mlir
@@ -1,4 +1,4 @@
-// RUN: [[ $IREE_VULKAN_DISABLE == 1 ]] || (iree-run-mlir -split-input-file -iree-hal-target-backends=vulkan-spirv -iree-flow-experimental-dispatch-reduce %s | IreeFileCheck %s)
+// RUN: [[ $IREE_VULKAN_DISABLE == 1 ]] || (iree-run-mlir -split-input-file -iree-hal-target-backends=vulkan-spirv %s | IreeFileCheck %s)
 
 // CHECK-LABEL: EXEC @reduce_dim_1
 // CHECK: 2xi32=25 50
diff --git a/test/e2e/xla/mnist.mlir b/test/e2e/xla/mnist.mlir
index baa5a0a..caf8228 100644
--- a/test/e2e/xla/mnist.mlir
+++ b/test/e2e/xla/mnist.mlir
@@ -1,7 +1,7 @@
 // MNIST model with placeholder weights, for translation testing.
 
-// RUN: iree-run-mlir -iree-hal-target-backends=vmla -iree-flow-experimental-dispatch-reduce %s -input-value="1x28x28x1xf32" | IreeFileCheck %s
-// RUN: [[ $IREE_VULKAN_DISABLE == 1 ]] || (iree-run-mlir -iree-hal-target-backends=vulkan-spirv -iree-flow-experimental-dispatch-reduce %s -input-value="1x28x28x1xf32" | IreeFileCheck %s)
+// RUN: iree-run-mlir -iree-hal-target-backends=vmla %s -input-value="1x28x28x1xf32" | IreeFileCheck %s
+// RUN: [[ $IREE_VULKAN_DISABLE == 1 ]] || (iree-run-mlir -iree-hal-target-backends=vulkan-spirv %s -input-value="1x28x28x1xf32" | IreeFileCheck %s)
 
 module {
   // CHECK-LABEL: EXEC @main
diff --git a/test/e2e/xla/reduce_float.mlir b/test/e2e/xla/reduce_float.mlir
index 270e882..db5813e 100644
--- a/test/e2e/xla/reduce_float.mlir
+++ b/test/e2e/xla/reduce_float.mlir
@@ -1,5 +1,5 @@
-// RUN: iree-run-mlir -iree-hal-target-backends=vmla -iree-flow-experimental-dispatch-reduce %s | IreeFileCheck %s
-// RUN: [[ $IREE_VULKAN_DISABLE == 1 ]] || (iree-run-mlir %s -iree-hal-target-backends=vulkan-spirv -iree-flow-experimental-dispatch-reduce )
+// RUN: iree-run-mlir -iree-hal-target-backends=vmla %s | IreeFileCheck %s
+// RUN: [[ $IREE_VULKAN_DISABLE == 1 ]] || (iree-run-mlir %s -iree-hal-target-backends=vulkan-spirv)
 
 // Float sum values from [1.0, 10.0]
 // CHECK-LABEL: EXEC @reduce_sum_1x10xf32
diff --git a/test/e2e/xla/reduce_int.mlir b/test/e2e/xla/reduce_int.mlir
index 03141dc..37bef13 100644
--- a/test/e2e/xla/reduce_int.mlir
+++ b/test/e2e/xla/reduce_int.mlir
@@ -1,5 +1,5 @@
-// RUN: iree-run-mlir -iree-hal-target-backends=vmla -iree-flow-experimental-dispatch-reduce %s | IreeFileCheck %s
-// RUN: [[ $IREE_VULKAN_DISABLE == 1 ]] || (iree-run-mlir -iree-hal-target-backends=vulkan-spirv  -iree-flow-experimental-dispatch-reduce %s)
+// RUN: iree-run-mlir -iree-hal-target-backends=vmla %s | IreeFileCheck %s
+// RUN: [[ $IREE_VULKAN_DISABLE == 1 ]] || (iree-run-mlir -iree-hal-target-backends=vulkan-spirv %s)
 
 // Int sum values from [1, 10]
 // CHECK-LABEL: EXEC @reduce_sum_1x10xi32
diff --git a/test/e2e/xla/unidirectional_lstm.mlir b/test/e2e/xla/unidirectional_lstm.mlir
index b74feee..b373880 100644
--- a/test/e2e/xla/unidirectional_lstm.mlir
+++ b/test/e2e/xla/unidirectional_lstm.mlir
@@ -1,7 +1,7 @@
 // An example LSTM exported from a python reference model with dummy weights.
 
-// RUN: iree-run-mlir %s -iree-hal-target-backends=vmla -iree-flow-experimental-dispatch-reduce -input-value="1x5xf32=[0 1 0 3 4]" -input-value="1x5x2x2xf32=[1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20]" -export-all=false  | IreeFileCheck %s --implicit-check-not="[" --implicit-check-not="]"
-// RUN: [[ $IREE_VULKAN_DISABLE == 1 ]] || (iree-run-mlir %s -iree-hal-target-backends=vulkan-spirv -input-value="1x5xf32=[0 1 0 3 4]" -input-value="1x5x2x2xf32=[1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20]" -iree-flow-experimental-dispatch-reduce -export-all=false |  IreeFileCheck %s --implicit-check-not="[" --implicit-check-not="]" )
+// RUN: iree-run-mlir %s -iree-hal-target-backends=vmla -input-value="1x5xf32=[0 1 0 3 4]" -input-value="1x5x2x2xf32=[1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20]" -export-all=false | IreeFileCheck %s --implicit-check-not="[" --implicit-check-not="]"
+// RUN: [[ $IREE_VULKAN_DISABLE == 1 ]] || (iree-run-mlir %s -iree-hal-target-backends=vulkan-spirv -input-value="1x5xf32=[0 1 0 3 4]" -input-value="1x5x2x2xf32=[1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20]" -export-all=false | IreeFileCheck %s --implicit-check-not="[" --implicit-check-not="]")
 
 // Exported via the XLA HLO Importer
 // The resulting MLIR was modified by hand by changing all large constants to be