diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/common.h b/compiler/plugins/target/ROCM/builtins/ukernel/common.h
index d046986..3113643 100644
--- a/compiler/plugins/target/ROCM/builtins/ukernel/common.h
+++ b/compiler/plugins/target/ROCM/builtins/ukernel/common.h
@@ -81,6 +81,41 @@
 int64_t __ockl_wfred_min_i64(int64_t);
 int32_t __ockl_wfred_min_i32(int32_t);
 
+// Flag value passed to __barrier() by __syncthreads(). Any nonzero flags value
+// makes __work_group_barrier() emit fences around the barrier.
+#define __CLK_LOCAL_MEM_FENCE 0x01
+typedef unsigned __cl_mem_fence_flags;
+
+// Issues a sequentially-consistent fence at "workgroup" scope.
+static inline void __threadfence_block() {
+  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
+}
+
+// Executes s_barrier. When `flags` is nonzero, the barrier is bracketed by a
+// release fence before and an acquire fence after, both at "workgroup" scope.
+// Marked convergent because it contains the barrier: calls to it must not be
+// made control-dependent on additional values.
+[[clang::convergent]] static inline void
+__work_group_barrier(__cl_mem_fence_flags flags) {
+  if (flags) {
+    __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
+    __builtin_amdgcn_s_barrier();
+    __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
+  } else {
+    __builtin_amdgcn_s_barrier();
+  }
+}
+
+// Forwards to __work_group_barrier(), converting `n` to fence flags.
+[[clang::convergent]] static inline void __barrier(int n) {
+  __work_group_barrier((__cl_mem_fence_flags)n);
+}
+
+// Barrier with fences: __barrier() with __CLK_LOCAL_MEM_FENCE.
+[[clang::convergent]] static inline void __syncthreads() {
+  __barrier(__CLK_LOCAL_MEM_FENCE);
+}
+
 //===----------------------------------------------------------------------===//
 // Local replacements for HIP headers
 //===----------------------------------------------------------------------===//
diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i32.c b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i32.c
index 4a6beef..0edf274 100644
--- a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i32.c
+++ b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i32.c
@@ -35,15 +35,18 @@
   uint64_t laneHasMaxValmask = __ballot(wgMax == laneMax);
   // if there is only one max value holder, write and exit.
   if (__builtin_popcountll(laneHasMaxValmask) == 1) {
-    if (wgMax == laneMax)
+    if (wgMax == laneMax) {
       outputBuffer[output_offset] = laneResult;
-    return;
+    }
+  } else {
+    // if there are multiple max value holders, find smallest index (argmax
+    // semantics).
+    int32_t indexVal = wgMax == laneMax ? laneResult : __INT32_MAX__;
+    laneResult = __ockl_wfred_min_i32(indexVal);
+    if (laneID == 0) {
+      outputBuffer[output_offset] = laneResult;
+    }
   }
-
-  // if there are multiple max value holder, find smallest index (argmax
-  // semantics).
-  int32_t indexVal = wgMax == laneMax ? laneResult : __INT32_MAX__;
-  laneResult = __ockl_wfred_min_i32(indexVal);
-  if (laneID == 0)
-    outputBuffer[output_offset] = laneResult;
+  // TODO(bjacob): this fence should be on the caller side. Move to TileAndFuse?
+  __threadfence_block();
 }
diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i64.c b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i64.c
index 33c1522..552ab87 100644
--- a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i64.c
+++ b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f16i64.c
@@ -36,14 +36,18 @@
   uint64_t laneHasMaxValmask = __ballot(wgMax == laneMax);
   // if there is only one max value holder, write and exit.
   if (__builtin_popcountll(laneHasMaxValmask) == 1) {
-    if (wgMax == laneMax)
+    if (wgMax == laneMax) {
       outputBuffer[output_offset] = laneResult;
-    return;
+    }
+  } else {
+    // if there are multiple max value holders, find smallest index (argmax
+    // semantics).
+    int64_t indexVal = wgMax == laneMax ? laneResult : INT64_MAX;
+    laneResult = __ockl_wfred_min_i64(indexVal);
+    if (laneID == 0) {
+      outputBuffer[output_offset] = laneResult;
+    }
   }
-  // if there are multiple max value holder, find smallest index (argmax
-  // semantics).
-  int64_t indexVal = wgMax == laneMax ? laneResult : INT64_MAX;
-  laneResult = __ockl_wfred_min_i64(indexVal);
-  if (laneID == 0)
-    outputBuffer[output_offset] = laneResult;
+  // TODO(bjacob): this fence should be on the caller side. Move to TileAndFuse?
+  __threadfence_block();
 }
diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i32.c b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i32.c
index f39d623..ec0c4c3 100644
--- a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i32.c
+++ b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i32.c
@@ -41,14 +41,18 @@
   uint64_t laneHasMaxValmask = __ballot(wgMax == laneMax);
   // if there is only one max value holder, write and exit.
   if (__builtin_popcountll(laneHasMaxValmask) == 1) {
-    if (wgMax == laneMax)
+    if (wgMax == laneMax) {
       outputBuffer[output_offset] = laneResult;
-    return;
+    }
+  } else {
+    // if there are multiple max value holders, find smallest index (argmax
+    // semantics).
+    int32_t indexVal = wgMax == laneMax ? laneResult : __INT32_MAX__;
+    laneResult = __ockl_wfred_min_i32(indexVal);
+    if (laneID == 0) {
+      outputBuffer[output_offset] = laneResult;
+    }
   }
-  // if there are multiple max value holder, find smallest index (argmax
-  // semantics).
-  int32_t indexVal = wgMax == laneMax ? laneResult : __INT32_MAX__;
-  laneResult = __ockl_wfred_min_i32(indexVal);
-  if (laneID == 0)
-    outputBuffer[output_offset] = laneResult;
+  // TODO(bjacob): this fence should be on the caller side. Move to TileAndFuse?
+  __threadfence_block();
 }
diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i64.c b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i64.c
index d6a9afb..40e7cae 100644
--- a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i64.c
+++ b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_argmax_f32i64.c
@@ -41,14 +41,18 @@
   uint64_t laneHasMaxValmask = __ballot(wgMax == laneMax);
   // if there is only one max value holder, write and exit.
   if (__builtin_popcountll(laneHasMaxValmask) == 1) {
-    if (wgMax == laneMax)
+    if (wgMax == laneMax) {
       outputBuffer[output_offset] = laneResult;
-    return;
+    }
+  } else {
+    // if there are multiple max value holders, find smallest index (argmax
+    // semantics).
+    int64_t indexVal = wgMax == laneMax ? laneResult : INT64_MAX;
+    laneResult = __ockl_wfred_min_i64(indexVal);
+    if (laneID == 0) {
+      outputBuffer[output_offset] = laneResult;
+    }
   }
-  // if there are multiple max value holder, find smallest index (argmax
-  // semantics).
-  int64_t indexVal = wgMax == laneMax ? laneResult : INT64_MAX;
-  laneResult = __ockl_wfred_min_i64(indexVal);
-  if (laneID == 0)
-    outputBuffer[output_offset] = laneResult;
+  // TODO(bjacob): this fence should be on the caller side. Move to TileAndFuse?
+  __threadfence_block();
 }
diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8.c b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8.c
index 9029a86..064bedb 100644
--- a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8.c
+++ b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8.c
@@ -7,31 +7,17 @@
 #include "compiler/plugins/target/ROCM/builtins/ukernel/common.h"
 
 // Very naive kernel. TODO(bjacob):
-// 1. Inlining: the `always_inline` attribute here is correctly preserved in
-//    the bitcode, but isn't having the intended effect of inlining calls to
-//    this function. Making that work is key as various function parameters
-//    (e.g. `unroll_m`) are meant to be constants.
-// 2. Shared memory: can't allocate it within the microkernel (which is just a
+// 1. Shared memory: can't allocate it within the microkernel (which is just a
 //    helper device function, not the actual amdgpu_kernel). Need to get it
-//    passed down here as a `T [[clang::address_space(3)]] *` parameter.
-// 3. Better scheduling via either barrier intrinsics or inline assemby.
-// 4. Subgroups1x4 being asymmetric is a historical accident... should be 2x2.
+//    passed down here as additional parameters.
+// 2. Better scheduling via either barrier intrinsics or inline assembly.
 [[clang::always_inline]] void iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8(
     const int8_t *a_buffer, int64_t a_offset, const int8_t *b_buffer,
     int64_t b_offset, int32_t *c_buffer, int64_t c_offset, int32_t k_size,
     int32_t unroll_m, int32_t subgroups_m, int32_t unroll_n,
     int32_t subgroups_n, int32_t unroll_k) {
-  /*
-    TODO(bjacob): reenable this once inlining works.
-    // Load existing accumulators. This is a VLA, but should become fixed-size
-    // once this function is inlined and unroll_* factors become constants.
-    int32x4_t c[unroll_m][unroll_n];
-  */
-  // Load existing accumulators.
-  if (unroll_m > 8 || unroll_n > 2) {
-    __builtin_trap();
-  }
-  int32x4_t c[8][2];
+  // Load existing accumulators. The VLA becomes a normal array after inlining.
+  int32x4_t c[unroll_m][unroll_n];
   int32x4_t *c_global = (int32x4_t *)(c_buffer + c_offset);
   for (int m = 0; m < unroll_m; ++m) {
     for (int n = 0; n < unroll_n; ++n) {
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUCastAddressSpaceFunction.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUCastAddressSpaceFunction.cpp
index aad0618..1df82fd 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUCastAddressSpaceFunction.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUCastAddressSpaceFunction.cpp
@@ -41,7 +41,7 @@
       bool anyCasted = false;
       for (auto operand : operands) {
         if (auto memrefType = dyn_cast<mlir::MemRefType>(operand.getType())) {
-          if (hasSharedMemoryAddressSpace(memrefType)) {
+          if (memrefType.getMemorySpace()) {
             mlir::MemRefType new_memrefType = mlir::MemRefType::get(
                 memrefType.getShape(), memrefType.getElementType(),
                 memrefType.getLayout());
