[Codegen] Resolve memref.dim ops inside narrow type emulation pass (#23029)

This fixes a compilation failure I encountered when compiling Llama 70B
FP4 with data tiling enabled for gfx950. The issue was that memref.dim
ops on narrow-type memrefs (such as f4E2M1FN) produced by
fat_raw_buffer_cast were marked illegal by the emulation pass, but could
not be resolved during the conversion.

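The fix adds a preprocessing step inside the narrow type emulation pass
that greedily applies the patterns registered by
`memref::populateResolveRankedShapedTypeResultDimsPatterns` before the
conversion, so these memref.dim ops are resolved ahead of emulation.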

Signed-off-by: Jorn Tuyls <jorn.tuyls@gmail.com>
diff --git a/compiler/src/iree/compiler/Codegen/Common/EmulateNarrowType.cpp b/compiler/src/iree/compiler/Codegen/Common/EmulateNarrowType.cpp
index 39272b6..82ec1b7 100644
--- a/compiler/src/iree/compiler/Codegen/Common/EmulateNarrowType.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/EmulateNarrowType.cpp
@@ -599,6 +599,20 @@
 
   MLIRContext *ctx = root->getContext();
 
+  // Resolve memref.dim ops before emulation. This is needed because the
+  // emulation linearizes memrefs, changing their rank and shape semantics.
+  // Any memref.dim on a narrow-type memref must be traced back to its source
+  // dynamic dimensions before we can safely emulate.
+  {
+    RewritePatternSet dimPatterns(ctx);
+    memref::populateResolveRankedShapedTypeResultDimsPatterns(dimPatterns);
+    GreedyRewriteConfig config;
+    config.setRegionSimplificationLevel(GreedySimplifyRegionLevel::Disabled);
+    if (failed(applyPatternsGreedily(root, std::move(dimPatterns), config))) {
+      return root->emitOpError("failed to resolve shaped type result dims");
+    }
+  }
+
   arith::NarrowTypeEmulationConverter typeConverter(kLoadStoreEmulateBitwidth);
   memref::populateMemRefNarrowTypeEmulationConversions(typeConverter);
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_emulate_narrow_type.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_emulate_narrow_type.mlir
index 446c098..0beb8fa 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_emulate_narrow_type.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_emulate_narrow_type.mlir
@@ -10,3 +10,35 @@
 //       CHECK:     %[[CAST:.*]] = amdgpu.fat_raw_buffer_cast %[[ARG0]] resetOffset
 //  CHECK-SAME:       : memref<2048xi8> to memref<2048xi8, #amdgpu.address_space<fat_raw_buffer>>
 //       CHECK:     return %[[CAST]]
+
+// -----
+
+// Test combining memref.dim resolution with narrow type emulation and vector ops.
+// This tests a previously failing case:
+// 1. memref.alloc provides a buffer with a dynamic leading dimension
+// 2. amdgpu.fat_raw_buffer_cast converts to fat buffer addressing
+// 3. memref.dim queries the dimension (must be resolved before emulation)
+// 4. vector.load/store operates on the narrow type (must be emulated to i8)
+func.func @dim_resolution_with_vector_emulation(%size: index) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %alloc = memref.alloc(%size) : memref<?x128xi4>
+  %cast = amdgpu.fat_raw_buffer_cast %alloc resetOffset
+      : memref<?x128xi4> to memref<?x128xi4, #amdgpu.address_space<fat_raw_buffer>>
+  %dim = memref.dim %cast, %c0 : memref<?x128xi4, #amdgpu.address_space<fat_raw_buffer>>
+  // Use %dim as the loop upper bound; the CHECK lines expect it resolved to %size.
+  scf.for %i = %c0 to %dim step %c1 {
+    // Load/store a vector<8xi4>; the CHECK lines expect both emulated as vector<4xi8>.
+    %vec = vector.load %cast[%i, %c0] : memref<?x128xi4, #amdgpu.address_space<fat_raw_buffer>>, vector<8xi4>
+    vector.store %vec, %cast[%i, %c0] : memref<?x128xi4, #amdgpu.address_space<fat_raw_buffer>>, vector<8xi4>
+  }
+  return
+}
+
+// CHECK-LABEL: func.func @dim_resolution_with_vector_emulation(
+//  CHECK-SAME:     %[[SIZE:.*]]: index
+// Verify the loop uses the resolved dimension (the function argument)
+//       CHECK:   scf.for %{{.*}} = %{{.*}} to %[[SIZE]]
+// Verify vector operations are emulated to i8 (8xi4 -> 4xi8)
+//       CHECK:     vector.load %{{.*}} : memref<?xi8, #amdgpu.address_space<fat_raw_buffer>>, vector<4xi8>
+//       CHECK:     vector.store %{{.*}} : memref<?xi8, #amdgpu.address_space<fat_raw_buffer>>, vector<4xi8>