[Flow] Fixed dropped dim computations to handle some ambiguous cases. (#15035)

The rank-reduced version of `flow.dispatch.tensor.load/store` suffers
from the same issue as upstream `tensor.extract_slice/insert_slice`:
the dropped dims computation is inherently ambiguous. This is ongoing
work (see https://github.com/openxla/iree/pull/14851). Here, once the
expected number of dropped dimensions has been found (while iterating
from outer to inner), no other dimensions need to be dropped.

Fixes #15016
diff --git a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp
index 62547e8..a5925f4 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp
+++ b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp
@@ -109,10 +109,12 @@
                    ArrayRef<OpFoldResult> mixedSizes) {
   ArrayRef<int64_t> resultShape = slicedObjectType.getShape();
   llvm::SmallBitVector droppedDims(mixedSizes.size());
-  if (slicedObjectType.getRank() == mixedSizes.size()) {
+  size_t maxDroppedDims = mixedSizes.size() - resultShape.size();
+  if (maxDroppedDims == 0) {
     return droppedDims;
   }
   unsigned shapePos = 0;
+  int numSet = 0;
   for (const auto &size : llvm::enumerate(mixedSizes)) {
     std::optional<int64_t> sizeVal = getConstantIntValue(size.value());
     // If the size is not 1, or if the current matched dimension of the result
@@ -124,6 +126,10 @@
       continue;
     }
     droppedDims.set(size.index());
+    numSet++;
+    if (numSet == maxDroppedDims) {
+      break;
+    }
   }
   return droppedDims;
 }
diff --git a/compiler/src/iree/compiler/Dialect/Flow/IR/test/tensor_folding.mlir b/compiler/src/iree/compiler/Dialect/Flow/IR/test/tensor_folding.mlir
index 6f579c1..be71444 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/IR/test/tensor_folding.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/IR/test/tensor_folding.mlir
@@ -695,3 +695,17 @@
   %1 = flow.tensor.reshape %0 : tensor<?x4xf32>{%arg1} -> tensor<?x?xf32>{%arg2, %arg3}
   return %1 : tensor<?x?xf32>
 }
+
+// -----
+
+func.func @innermost_unit_dim(%4: !flow.dispatch.tensor<readonly:tensor<3x1x16x257x88xf16>>,
+    %arg0: index, %arg2 : index, %10 : index, %9 : index) -> tensor<?x?x?xf16> {
+  %c16 = arith.constant 16 : index
+  %c1 = arith.constant 1 : index
+  %11 = flow.dispatch.tensor.load %4, offsets = [1, 0, %arg0, %10, %arg2], sizes = [1, 1, %c16, %9, %c1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x1x16x257x88xf16>> -> tensor<?x?x?xf16>
+  return %11 : tensor<?x?x?xf16>
+}
+// CHECK-LABEL: func @innermost_unit_dim
+//  CHECK-SAME:     %[[DYNAMIC_DIM:[a-zA-Z0-9]+]]: index)
+//       CHECK:   flow.dispatch.tensor.load
+//  CHECK-SAME:       sizes = [1, 1, 16, %[[DYNAMIC_DIM]], 1]