Add support for lowering ops that implement the `UKernelOpInterface` on the CUDA path. (#14101)

This change adds support for lowering operations that implement the
`UKernelOpInterface` on the CUDA backend. This allows custom
dispatches to use ops that implement this interface and have them
compiled down to CUDA.

Unlike the CPU side, the CUDA side needs more information than just
the number of workgroups to use: it also needs the workgroup size. To
allow specifying this, an `#iree_codegen.export_config` attribute is
added that carries the workgroup size to use for the custom dispatch.
This information is used to set the workgroup size on the export op
during `MaterializeInterfaces` in HAL.
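
For example (a minimal sketch with an illustrative workgroup size; on a
pre-formed dispatch the same attribute is attached as the dialect
attribute `iree_codegen.export_config`, as in the sample added below):

```mlir
// Request a workgroup (block) size of 64 along x. Sizes with fewer than
// three entries are padded with 1s during `MaterializeInterfaces`.
func.func @example() attributes {
  export_config = #iree_codegen.export_config<workgroup_size = [64 : index]>
} {
  return
}
```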

An example of the bitcode usage is added with this PR. Since there are
no in-tree examples of CUDA micro-kernels, the bitcode is linked in
using the `--iree-link-bitcode` flag (which is the renamed
`--iree-llvmcpu-link-bitcode`).
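
Roughly, the sample's build and compile flow looks like this (a sketch:
the clang flags mirror the new `iree_cuda_bitcode_library` rule and the
file names are the ones the sample uses, while the `iree-compile`
invocation assumes the usual CUDA target flags):

```shell
# Compile the CUDA ukernel to LLVM bitcode (what iree_cuda_bitcode_library
# does under the hood, assuming sm_60 as in the sample).
clang -x cuda --cuda-gpu-arch=sm_60 --cuda-device-only -nocudalib \
      -Wno-unknown-cuda-version -O3 -c -emit-llvm ukernel.cu -o cuda_ukernel.bc

# Link the bitcode into the generated CUDA module when compiling the dispatch.
iree-compile ukernel_example.mlir \
  --iree-hal-target-backends=cuda \
  --iree-link-bitcode=cuda_ukernel.bc \
  -o ukernel_example.vmfb
```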

Fixes #14028
diff --git a/build_tools/bazel/iree_bitcode_library.bzl b/build_tools/bazel/iree_bitcode_library.bzl
index 89ad587..2475b60 100644
--- a/build_tools/bazel/iree_bitcode_library.bzl
+++ b/build_tools/bazel/iree_bitcode_library.bzl
@@ -130,6 +130,101 @@
 
     if not out:
         out = "%s.bc" % (name)
+
+    native.genrule(
+        name = name,
+        srcs = bitcode_files,
+        outs = [out],
+        cmd = " && ".join([
+            " ".join([
+                "$(location %s)" % (link_tool),
+                "-o $(location %s)" % (out),
+                " ".join(["$(locations %s)" % (src) for src in bitcode_files]),
+            ]),
+        ]),
+        tools = [link_tool],
+        message = "Linking bitcode library %s to %s..." % (name, out),
+        output_to_bindir = 1,
+        **kwargs
+    )
+
+def iree_cuda_bitcode_library(
+        name,
+        cuda_arch,
+        srcs,
+        internal_hdrs = [],
+        copts = [],
+        out = None,
+        **kwargs):
+    """Builds an LLVM bitcode library for CUDA from an input file via clang.
+
+    Args:
+        name: Name of the target.
+        cuda_arch: Target sm architecture to compile for.
+        srcs: source files to pass to clang.
+        internal_hdrs: all headers transitively included by the source files.
+                       Unlike typical Bazel `hdrs`, these are not exposed as
+                       interface headers. This would normally be part of `srcs`,
+                       but separating it was easier for `bazel_to_cmake`, as
+                       CMake does not need this, and making this explicitly
+                       Bazel-only allows using `filegroup` on the Bazel side.
+        copts: additional flags to pass to clang.
+        out: output file name (defaults to name.bc).
+        **kwargs: any additional attributes to pass to the underlying rules.
+    """
+
+    clang_tool = "@llvm-project//clang:clang"
+    link_tool = "@llvm-project//llvm:llvm-link"
+    builtin_headers_dep = "@llvm-project//clang:builtin_headers_gen"
+    builtin_headers_path = "external/llvm-project/clang/staging/include/"
+
+    base_copts = [
+        "-x",
+        "cuda",
+
+        # Target architecture
+        "--cuda-gpu-arch=%s" % (cuda_arch),
+
+        # Suppress warnings
+        "-Wno-unknown-cuda-version",
+        "-nocudalib",
+        "--cuda-device-only",
+
+        # Optimized.
+        "-O3",
+
+        # Object file only in bitcode format:
+        "-c",
+        "-emit-llvm",
+    ]
+
+    bitcode_files = []
+    for src in srcs:
+        bitcode_out = "%s_%s.bc" % (name, src)
+        bitcode_files.append(bitcode_out)
+        native.genrule(
+            name = "gen_%s" % (bitcode_out),
+            srcs = [src, builtin_headers_dep] + internal_hdrs,
+            outs = [bitcode_out],
+            cmd = " && ".join([
+                " ".join([
+                    "$(location %s)" % (clang_tool),
+                    " ".join(base_copts + copts),
+                    "-o $(location %s)" % (bitcode_out),
+                    "$(location %s)" % (src),
+                ]),
+            ]),
+            tools = [
+                clang_tool,
+            ],
+            message = "Compiling %s to %s..." % (src, bitcode_out),
+            output_to_bindir = 1,
+            **kwargs
+        )
+
+    if not out:
+        out = "%s.bc" % (name)
+
     native.genrule(
         name = name,
         srcs = bitcode_files,
diff --git a/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py b/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py
index f6d6cb5..818d297 100644
--- a/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py
+++ b/build_tools/bazel_to_cmake/bazel_to_cmake_converter.py
@@ -519,6 +519,25 @@
             f")\n\n"
         )
 
+    def iree_cuda_bitcode_library(
+        self, name, cuda_arch, srcs, internal_hdrs=None, copts=None
+    ):
+        name_block = self._convert_string_arg_block("NAME", name, quote=False)
+        cuda_arch_block = self._convert_string_arg_block(
+            "CUDA_ARCH", cuda_arch, quote=False
+        )
+        srcs_block = self._convert_srcs_block(srcs)
+        copts_block = self._convert_string_list_block("COPTS", copts, sort=False)
+
+        self._converter.body += (
+            f"iree_bitcode_library(\n"
+            f"{name_block}"
+            f"{cuda_arch_block}"
+            f"{srcs_block}"
+            f"{copts_block}"
+            f")\n\n"
+        )
+
     def iree_link_bitcode(self, name, bitcode_files):
         name_block = self._convert_string_arg_block("NAME", name, quote=False)
         bitcode_files_block = self._convert_srcs_block(
diff --git a/build_tools/cmake/iree_bitcode_library.cmake b/build_tools/cmake/iree_bitcode_library.cmake
index 593348a..c601e51 100644
--- a/build_tools/cmake/iree_bitcode_library.cmake
+++ b/build_tools/cmake/iree_bitcode_library.cmake
@@ -116,6 +116,91 @@
   )
 endfunction()
 
+function(iree_cuda_bitcode_library)
+  cmake_parse_arguments(
+    _RULE
+    ""
+    "NAME;OUT;CUDA_ARCH"
+    "SRCS;COPTS"
+    ${ARGN}
+  )
+
+  if(DEFINED _RULE_OUT)
+    set(_OUT "${_RULE_OUT}")
+  else()
+    set(_OUT "${_RULE_NAME}.bc")
+  endif()
+
+  set(_CUDA_ARCH "${_RULE_CUDA_ARCH}")
+  
+  set(_COPTS
+    "-x" "cuda"
+    
+    # Target architecture.
+    "--cuda-gpu-arch=${_CUDA_ARCH}"
+
+    # Suppress warnings about missing path to cuda lib,
+    # and benign warning about CUDA version.
+    "-Wno-unknown-cuda-version"
+    "-nocudalib"
+    "--cuda-device-only"
+
+    # Optimized.
+    "-O3"
+
+    # Object file only in bitcode format:
+    "-c"
+    "-emit-llvm"
+  )
+
+  set(_BITCODE_FILES)
+  foreach(_SRC ${_RULE_SRCS})
+    get_filename_component(_BITCODE_SRC_PATH "${_SRC}" REALPATH)
+    set(_BITCODE_FILE "${_RULE_NAME}_${_SRC}.bc")
+    list(APPEND _BITCODE_FILES ${_BITCODE_FILE})
+    add_custom_command(
+      OUTPUT
+        "${_BITCODE_FILE}"
+      COMMAND
+        "${IREE_CLANG_BINARY}"
+        ${_COPTS}
+        "${_BITCODE_SRC_PATH}"
+        "-o"
+        "${_BITCODE_FILE}"
+      DEPENDS
+        "${IREE_CLANG_BINARY}"
+        "${_SRC}"
+      COMMENT
+        "Compiling ${_SRC} to ${_BITCODE_FILE}"
+      VERBATIM
+    )
+  endforeach()
+
+  add_custom_command(
+    OUTPUT
+      ${_OUT}
+    COMMAND
+      ${IREE_LLVM_LINK_BINARY}
+      ${_BITCODE_FILES}
+      "-o"
+      "${_OUT}"
+    DEPENDS
+      ${IREE_LLVM_LINK_BINARY}
+      ${_BITCODE_FILES}
+    COMMENT
+      "Linking bitcode to ${_OUT}"
+    VERBATIM
+  )
+
+  # Only add iree_${NAME} since custom targets don't support aliasing to
+  # iree::${NAME}.
+  iree_package_name(_PACKAGE_NAME)
+  add_custom_target("${_PACKAGE_NAME}_${_RULE_NAME}"
+    DEPENDS "${_OUT}"
+  )
+endfunction()
+
+
 # iree_link_bitcode()
 #
 # Builds an LLVM bitcode library from an input file via clang
diff --git a/build_tools/cmake/iree_bytecode_module.cmake b/build_tools/cmake/iree_bytecode_module.cmake
index 3be5838..8bbfaff 100644
--- a/build_tools/cmake/iree_bytecode_module.cmake
+++ b/build_tools/cmake/iree_bytecode_module.cmake
@@ -136,6 +136,14 @@
     get_filename_component(_FRIENDLY_NAME "${_RULE_SRC}" NAME)
   endif()
 
+  set(_DEPENDS "")
+  iree_package_ns(_PACKAGE_NAME)
+  list(TRANSFORM _RULE_DEPENDS REPLACE "^::" "${_PACKAGE_NAME}::")
+  foreach(_DEPEND ${_RULE_DEPENDS})
+    string(REPLACE "::" "_" _DEPEND "${_DEPEND}")
+    list(APPEND _DEPENDS ${_DEPEND})
+  endforeach()
+
   add_custom_command(
     OUTPUT
       ${_OUTPUT_FILES}
@@ -146,7 +154,7 @@
       ${_COMPILE_TOOL}
       ${_LINKER_TOOL_EXECUTABLE}
       ${_RULE_SRC}
-      ${_RULE_DEPENDS}
+      ${_DEPENDS}
     COMMENT
       "Generating ${_MODULE_FILE_NAME} from ${_FRIENDLY_NAME}"
     VERBATIM
diff --git a/build_tools/cmake/iree_check_test.cmake b/build_tools/cmake/iree_check_test.cmake
index c30c014..b7d6a24 100644
--- a/build_tools/cmake/iree_check_test.cmake
+++ b/build_tools/cmake/iree_check_test.cmake
@@ -40,6 +40,7 @@
 #       to use for the generated IREE module (.vmfb).
 #   TARGET_CPU_FEATURES: If specified, a string passed as argument to
 #       --iree-llvmcpu-target-cpu-features.
+#   DEPENDS: Optional. Additional dependencies beyond SRC and the tools.
 function(iree_check_test)
   if(NOT IREE_BUILD_TESTS)
     return()
@@ -57,7 +58,7 @@
     _RULE
     ""
     "NAME;SRC;TARGET_BACKEND;DRIVER;MODULE_FILE_NAME"
-    "COMPILER_FLAGS;RUNNER_ARGS;LABELS;TARGET_CPU_FEATURES;TIMEOUT"
+    "COMPILER_FLAGS;RUNNER_ARGS;LABELS;TARGET_CPU_FEATURES;DEPENDS;TIMEOUT"
     ${ARGN}
   )
 
@@ -84,7 +85,7 @@
   if (_RULE_TARGET_CPU_FEATURES)
     list(APPEND _BASE_COMPILER_FLAGS "--iree-llvmcpu-target-cpu-features=${_RULE_TARGET_CPU_FEATURES}")
   endif()
-
+  
   iree_bytecode_module(
     NAME
       "${_MODULE_NAME}"
@@ -95,6 +96,8 @@
     FLAGS
       "${_BASE_COMPILER_FLAGS}"
       "${_RULE_COMPILER_FLAGS}"
+    DEPENDS
+      "${_RULE_DEPENDS}"
   )
 
   set(_RUNNER_TARGET "iree-check-module")
@@ -154,6 +157,7 @@
 #       is added automatically.
 #   TARGET_CPU_FEATURES: If specified, a string passed as argument to
 #       --iree-llvmcpu-target-cpu-features.
+#   DEPENDS: Optional. Additional dependencies beyond SRC and the tools.
 function(iree_check_single_backend_test_suite)
   if(NOT IREE_BUILD_TESTS)
     return()
@@ -167,7 +171,7 @@
     _RULE
     ""
     "NAME;TARGET_BACKEND;DRIVER"
-    "SRCS;COMPILER_FLAGS;RUNNER_ARGS;LABELS;TARGET_CPU_FEATURES;TIMEOUT"
+    "SRCS;COMPILER_FLAGS;RUNNER_ARGS;LABELS;TARGET_CPU_FEATURES;DEPENDS;TIMEOUT"
     ${ARGN}
   )
 
@@ -239,6 +243,8 @@
         ${_RULE_LABELS}
       TARGET_CPU_FEATURES
         ${_RULE_TARGET_CPU_FEATURES}
+      DEPENDS
+        ${_RULE_DEPENDS}
       TIMEOUT
         ${_RULE_TIMEOUT}
     )
diff --git a/compiler/plugins/target/CUDA/CUDATarget.cpp b/compiler/plugins/target/CUDA/CUDATarget.cpp
index 0e9ad1e..3ca76c4 100644
--- a/compiler/plugins/target/CUDA/CUDATarget.cpp
+++ b/compiler/plugins/target/CUDA/CUDATarget.cpp
@@ -291,6 +291,11 @@
   // Link user modules and libdevice (if required).
   // Note that linking order matters:
   llvm::Linker linker(module);
+  if (failed(linkCmdlineBitcodeFile(loc, linker, llvm::Linker::OverrideFromSrc,
+                                    targetMachine, module.getContext()))) {
+    return failure();
+  }
+
   unsigned linkerFlags =
       llvm::Linker::LinkOnlyNeeded | llvm::Linker::OverrideFromSrc;
   if (failed(linkBitcodeObjects(loc, linker, linkerFlags, targetMachine,
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Dialect/BUILD.bazel
index 49181a9..2949603 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/Dialect/BUILD.bazel
@@ -102,7 +102,7 @@
         ),
     ],
     tblgen = "@llvm-project//mlir:mlir-tblgen",
-    td_file = "IREECodegenAttributes.td",
+    td_file = "IREECodegenDialect.td",
     deps = [":td_files"],
 )
 
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Dialect/CMakeLists.txt
index 7efe4f0..b76e899 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/Dialect/CMakeLists.txt
@@ -59,7 +59,7 @@
   NAME
     IREECodegenDialectGen
   TD_FILE
-    "IREECodegenAttributes.td"
+    "IREECodegenDialect.td"
   OUTS
     --gen-dialect-decls IREECodegenDialect.h.inc
     --gen-dialect-defs IREECodegenDialect.cpp.inc
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/LoweringConfig.cpp b/compiler/src/iree/compiler/Codegen/Dialect/LoweringConfig.cpp
index 6b56930..4d3f1ee 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/LoweringConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/LoweringConfig.cpp
@@ -71,6 +71,29 @@
 namespace Codegen {
 
 //===----------------------------------------------------------------------===//
+// iree_codegen.export_config
+//===----------------------------------------------------------------------===//
+
+LogicalResult
+ExportConfigAttr::verify(function_ref<InFlightDiagnostic()> emitError,
+                         ArrayAttr workgroupSize) {
+  if (!workgroupSize) {
+    return success();
+  }
+  if (workgroupSize.size() > 3) {
+    return emitError() << "expected workgroup size to have at most 3 entries";
+  }
+  if (!llvm::all_of(workgroupSize, [](Attribute attr) {
+        auto intAttr = llvm::dyn_cast<IntegerAttr>(attr);
+        return intAttr && intAttr.getType().isIndex();
+      })) {
+    return emitError()
+           << "expected workgroup size to contain values of index type";
+  }
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
 // iree_codegen.translation_info
 //===----------------------------------------------------------------------===//
 
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/LoweringConfig.h b/compiler/src/iree/compiler/Codegen/Dialect/LoweringConfig.h
index 869618a..4669313 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/LoweringConfig.h
+++ b/compiler/src/iree/compiler/Codegen/Dialect/LoweringConfig.h
@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef IREE_COMPILER_CONVERSION_COMMON_LOWERINGCONFIG_H_
-#define IREE_COMPILER_CONVERSION_COMMON_LOWERINGCONFIG_H_
+#ifndef IREE_COMPILER_CODEGEN_DIALECT_LOWERINGCONFIG_H_
+#define IREE_COMPILER_CODEGEN_DIALECT_LOWERINGCONFIG_H_
 
 #include "iree/compiler/Codegen/Utils/Utils.h"
 #include "iree/compiler/Dialect/HAL/IR/HALOps.h"
@@ -164,4 +164,4 @@
 } // namespace iree_compiler
 } // namespace mlir
 
-#endif // IREE_COMPILER_CONVERSION_COMMON_LOWERINGCONFIG_H_
+#endif // IREE_COMPILER_CODEGEN_DIALECT_LOWERINGCONFIG_H_
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/LoweringConfig.td b/compiler/src/iree/compiler/Codegen/Dialect/LoweringConfig.td
index 55ce459..f254c43 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/LoweringConfig.td
+++ b/compiler/src/iree/compiler/Codegen/Dialect/LoweringConfig.td
@@ -25,33 +25,33 @@
     : I32EnumAttrCase<"Mmt4dTilingExpert", 5>;
 def CPU_BufferOpsTileAndVectorize
     : I32EnumAttrCase<"CPUBufferOpsTileAndVectorize", 6>;
-
 def CPU_DataTiling
     : I32EnumAttrCase<"CPUDataTiling", 7>;
 
-def LLVMGPU_SimpleDistribute : I32EnumAttrCase<"LLVMGPUDistribute", 8>;
-def LLVMGPU_Vectorize : I32EnumAttrCase<"LLVMGPUVectorize", 9>;
-def LLVMGPU_MatmulSimt : I32EnumAttrCase<"LLVMGPUMatmulSimt", 10>;
-def LLVMGPU_MatmulTensorCore : I32EnumAttrCase<"LLVMGPUMatmulTensorCore", 11>;
-def LLVMGPU_TransposeSharedMem : I32EnumAttrCase<"LLVMGPUTransposeSharedMem", 12>;
-def LLVMGPU_WarpReduction : I32EnumAttrCase<"LLVMGPUWarpReduction", 13>;
-def LLVMGPU_PackUnPack : I32EnumAttrCase<"LLVMGPUPackUnPack", 14>;
-def LLVMGPU_MatmulTensorCoreMmaSync : I32EnumAttrCase<"LLVMGPUMatmulTensorCoreMmaSync", 15>;
+def LLVMGPU_Default : I32EnumAttrCase<"LLVMGPUDefault", 8>;
+def LLVMGPU_SimpleDistribute : I32EnumAttrCase<"LLVMGPUDistribute", 9>;
+def LLVMGPU_Vectorize : I32EnumAttrCase<"LLVMGPUVectorize", 10>;
+def LLVMGPU_MatmulSimt : I32EnumAttrCase<"LLVMGPUMatmulSimt", 11>;
+def LLVMGPU_MatmulTensorCore : I32EnumAttrCase<"LLVMGPUMatmulTensorCore", 12>;
+def LLVMGPU_TransposeSharedMem : I32EnumAttrCase<"LLVMGPUTransposeSharedMem", 13>;
+def LLVMGPU_WarpReduction : I32EnumAttrCase<"LLVMGPUWarpReduction", 14>;
+def LLVMGPU_PackUnPack : I32EnumAttrCase<"LLVMGPUPackUnPack", 15>;
+def LLVMGPU_MatmulTensorCoreMmaSync : I32EnumAttrCase<"LLVMGPUMatmulTensorCoreMmaSync", 16>;
 
 def SPIRV_BaseDistribute
-    : I32EnumAttrCase<"SPIRVBaseDistribute", 16>;
+    : I32EnumAttrCase<"SPIRVBaseDistribute", 17>;
 def SPIRV_BaseVectorize
-    : I32EnumAttrCase<"SPIRVBaseVectorize", 17>;
+    : I32EnumAttrCase<"SPIRVBaseVectorize", 18>;
 def SPIRV_MatmulPromoteVectorize
-    : I32EnumAttrCase<"SPIRVMatmulPromoteVectorize", 18>;
+    : I32EnumAttrCase<"SPIRVMatmulPromoteVectorize", 19>;
 def SPIRV_CooperativeMatrixVectorize
-    : I32EnumAttrCase<"SPIRVCooperativeMatrixVectorize", 19>;
+    : I32EnumAttrCase<"SPIRVCooperativeMatrixVectorize", 20>;
 def SPIRV_SubgroupReduce
-    : I32EnumAttrCase<"SPIRVSubgroupReduce", 20>;
+    : I32EnumAttrCase<"SPIRVSubgroupReduce", 21>;
 def SPIRV_WinogradVectorize
-    : I32EnumAttrCase<"SPIRVWinogradVectorize", 21>;
+    : I32EnumAttrCase<"SPIRVWinogradVectorize", 22>;
 
-def VMVX_Default : I32EnumAttrCase<"VMVXDefault", 22>;
+def VMVX_Default : I32EnumAttrCase<"VMVXDefault", 23>;
 
 
 def Linalg_TransformDialectCodegen
@@ -68,7 +68,7 @@
             CPU_Default, CPU_DoubleTilingExpert, CPU_DoubleTilingPadExpert,
             CPU_DoubleTilingPeelingExpert, CPU_ConvTileAndDecomposeExpert,
             CPU_Mmt4dTilingExpert, CPU_BufferOpsTileAndVectorize,
-            CPU_DataTiling, LLVMGPU_SimpleDistribute, LLVMGPU_Vectorize,
+            CPU_DataTiling, LLVMGPU_Default, LLVMGPU_SimpleDistribute, LLVMGPU_Vectorize,
             LLVMGPU_MatmulSimt, LLVMGPU_MatmulTensorCore,
             LLVMGPU_TransposeSharedMem, LLVMGPU_WarpReduction,
             LLVMGPU_PackUnPack, LLVMGPU_MatmulTensorCoreMmaSync,
@@ -240,4 +240,19 @@
   let genVerifyDecl = 1;
 }
 
+def IREECodegen_ExportConfig : AttrDef<IREECodegen_Dialect, "ExportConfig", []> {
+  let mnemonic = "export_config";
+  let summary = "User defined workgroup size specification";
+  let description = [{
+    Allows setting workgroup size for pre-formed dispatches.
+  }];
+  let parameters = (ins
+    AttrParameter<"ArrayAttr", "Workgroup Size to use">:$workgroup_size
+  );
+  let assemblyFormat = [{
+    `<` `workgroup_size` `=` $workgroup_size `>`
+  }];
+  let genVerifyDecl = 1;
+}
+
 #endif // IREE_COMPILER_CODEGEN_DIALECT_LOWERINGCONFIG
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Dialect/test/BUILD.bazel
index 7cae226..f3f67b5 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/Dialect/test/BUILD.bazel
@@ -18,6 +18,7 @@
     name = "lit",
     srcs = enforce_glob(
         [
+            "invalid.mlir",
             "lowering_config_attr.mlir",
             "ukernel_ops.mlir",
         ],
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Dialect/test/CMakeLists.txt
index 84a8753..62091dd 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/Dialect/test/CMakeLists.txt
@@ -14,6 +14,7 @@
   NAME
     lit
   SRCS
+    "invalid.mlir"
     "lowering_config_attr.mlir"
     "ukernel_ops.mlir"
   TOOLS
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/test/invalid.mlir b/compiler/src/iree/compiler/Codegen/Dialect/test/invalid.mlir
new file mode 100644
index 0000000..2917002
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/Dialect/test/invalid.mlir
@@ -0,0 +1,21 @@
+// RUN: iree-opt --split-input-file --verify-diagnostics %s
+
+module {
+  func.func @export_config_invalid_type() attributes {
+    // expected-error @+1 {{expected workgroup size to contain values of index type}}
+    export_config = #iree_codegen.export_config<workgroup_size = [4, 1]>
+  } {
+    return
+  }
+}
+
+// -----
+
+module {
+  func.func @export_config_too_many_entries() attributes {
+    // expected-error @+1 {{expected workgroup size to have at most 3 entries}}
+    export_config = #iree_codegen.export_config<workgroup_size = [4: index, 1: index, 1: index, 1: index]>
+  } {
+    return
+  }
+}
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/test/lowering_config_attr.mlir b/compiler/src/iree/compiler/Codegen/Dialect/test/lowering_config_attr.mlir
index 29e2315..fd2c094 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/test/lowering_config_attr.mlir
+++ b/compiler/src/iree/compiler/Codegen/Dialect/test/lowering_config_attr.mlir
@@ -60,3 +60,13 @@
 // CHECK: #translation = #iree_codegen.translation_info<CPUDefault>
 // CHECK: #compilation = #iree_codegen.compilation_info<lowering_config = #config, translation_info = #translation, workgroup_size = [16, 4, 1], subgroup_size = 32>
 
+// -----
+
+module {
+  func.func @test() attributes {
+    export_config = #iree_codegen.export_config<workgroup_size = [4: index, 1: index]>
+  } {
+    return
+  }
+}
+// CHECK: #iree_codegen.export_config<workgroup_size = [4 : index, 1 : index]
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
index 799e688..d2a7c90 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
@@ -76,6 +76,7 @@
         "//compiler/src/iree/compiler/Codegen/Common:TransformDialectInterpreterPass",
         "//compiler/src/iree/compiler/Codegen/Common/GPU:CommonGPUPasses",
         "//compiler/src/iree/compiler/Codegen/Dialect:IREECodegenDialect",
+        "//compiler/src/iree/compiler/Codegen/Interfaces:UKernelOpInterface",
         "//compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions:LLVMGPUExtensions",
         "//compiler/src/iree/compiler/Codegen/LLVMGPU/Utils",
         "//compiler/src/iree/compiler/Codegen/TransformStrategies/GPU",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
index 4800e70..74b8a77 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
@@ -125,6 +125,7 @@
     iree::compiler::Codegen::Common::GPU::CommonGPUPasses
     iree::compiler::Codegen::Common::TransformDialectInterpreterPass
     iree::compiler::Codegen::Dialect::IREECodegenDialect
+    iree::compiler::Codegen::Interfaces::UKernelOpInterface
     iree::compiler::Codegen::LLVMGPU::TransformExtensions::LLVMGPUExtensions
     iree::compiler::Codegen::LLVMGPU::Utils
     iree::compiler::Codegen::TransformStrategies::GPU
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp
index ed9cd75..529bd25 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp
@@ -60,7 +60,7 @@
 /// code.
 struct ConvertToNVVMPass : public ConvertToNVVMBase<ConvertToNVVMPass> {
   void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<LLVM::LLVMDialect, NVVM::NVVMDialect>();
+    registry.insert<gpu::GPUDialect, LLVM::LLVMDialect, NVVM::NVVMDialect>();
   }
   void runOnOperation() override {
     ModuleOp m = getOperation();
@@ -141,13 +141,7 @@
       populateNVGPUToNVVMConversionPatterns(converter, llvmPatterns);
       populateGpuWMMAToNVVMConversionPatterns(converter, llvmPatterns);
       LLVMConversionTarget target(getContext());
-      populateFuncToLLVMFuncOpConversionPattern(converter, llvmPatterns);
       configureGpuToNVVMConversionLegality(target);
-      target.addDynamicallyLegalOp<func::FuncOp>([&](func::FuncOp funcOp) {
-        if (isEntryPoint(funcOp))
-          return false;
-        return true;
-      });
       if (failed(applyPartialConversion(m, target, std::move(llvmPatterns)))) {
         signalPassFailure();
       }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index 05310ae..3f37256 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -11,6 +11,7 @@
 #include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.h"
 #include "iree/compiler/Codegen/Common/UserConfig.h"
 #include "iree/compiler/Codegen/Dialect/LoweringConfig.h"
+#include "iree/compiler/Codegen/Interfaces/UKernelOpInterface.h"
 #include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h"
 #include "iree/compiler/Codegen/Utils/GPUUtils.h"
 #include "iree/compiler/Codegen/Utils/LinalgOpInfo.h"
@@ -850,6 +851,16 @@
       workgroupSize);
 }
 
+/// Make UKernels take the LLVMGPUDefault lowering pipeline.
+static LogicalResult
+setUKernelConfig(func::FuncOp entryPoint,
+                 IREE::Codegen::UKernelOpInterface ukernelOp) {
+  auto translationInfo = IREE::Codegen::TranslationInfoAttr::get(
+      entryPoint->getContext(),
+      IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUDefault);
+  return setTranslationInfo(entryPoint, translationInfo);
+}
+
 /// Decides the tiling and distribution parameters for one convolution
 /// dimension. Returns true if we can succesfully deduce.
 ///
@@ -1062,6 +1073,9 @@
   if (auto packOp = dyn_cast<tensor::PackOp>(computeOp)) {
     return setPackConfig(entryPointFn, packOp);
   }
+  if (auto ukernelOp = dyn_cast<IREE::Codegen::UKernelOpInterface>(computeOp)) {
+    return setUKernelConfig(entryPointFn, ukernelOp);
+  }
 
   return setRootDefaultConfig(entryPointFn, computeOp);
 }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
index 64a5dbb..8d20ab8 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
@@ -153,6 +153,9 @@
 
   if (!testLoweringConfiguration && translationInfo.has_value()) {
     switch (translationInfo.value().getDispatchLoweringPassPipeline()) {
+    case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUDefault:
+      addGPUDefaultPassPipeline(executableLoweringPipeline);
+      break;
     case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUDistribute:
       addGPUSimpleDistributePassPipeline(executableLoweringPipeline);
       break;
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index e9eeeef..41fa43e 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -452,6 +452,13 @@
       createRemoveSingleIterationLoopPass());
 }
 
+void addGPUDefaultPassPipeline(OpPassManager &pm) {
+  tileAndBufferize(pm);
+  auto &nestedModulePM = pm.nest<ModuleOp>();
+  nestedModulePM.addNestedPass<func::FuncOp>(
+      createRemoveSingleIterationLoopPass());
+}
+
 // Sub pipeline to make the address computation more explicit and
 // optimize them.
 // The idea here is to be less dependent on what the backend is able to
@@ -478,6 +485,8 @@
   pm.addPass(createCanonicalizerPass());
   pm.addPass(createCSEPass());
 
+  pm.addPass(createLowerUKernelOpsToCallsPass());
+
   // LinalgExt -> SCF
   pm.addNestedPass<func::FuncOp>(IREE::LinalgExt::createLinalgExtToLoopsPass());
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
index d1cab28..3ccb9b2 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
@@ -49,6 +49,9 @@
 /// Lowering reductions to warp reductions.
 void addGPUWarpReductionPassPipeline(OpPassManager &pm);
 
+/// Default pass pipeline on GPU, currently used only for the ukernel path.
+void addGPUDefaultPassPipeline(OpPassManager &pm);
+
 /// Populates passes needed to lower a XLA HLO op to NVVM/ROCDL dialect via
 /// the structured ops path. The pass manager `pm` in here should operate on
 /// the module within the IREE::HAL::ExecutableOp.
diff --git a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp
index 6404e6c..f431cb3 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp
+++ b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp
@@ -313,8 +313,6 @@
   SmallVector<OpAsmParser::UnresolvedOperand> allOperands;
   std::unique_ptr<Region> bodyRegion = std::make_unique<Region>();
   std::unique_ptr<Region> workloadCountRegion = std::make_unique<Region>();
-  if (parser.parseOptionalAttrDict(result.attributes))
-    return failure();
   SmallVector<OpAsmParser::UnresolvedOperand> workloadOperands;
   SMLoc workloadOperandsLoc;
   (void)workloadOperandsLoc;
@@ -346,6 +344,8 @@
     if (typeListResult)
       return failure();
   }
+  if (parser.parseOptionalAttrDictWithKeyword(result.attributes))
+    return failure();
   if (parser.parseRegion(*bodyRegion))
     return failure();
   ensureTerminator(*bodyRegion, parser.getBuilder(), result.location);
@@ -377,7 +377,6 @@
 void DispatchRegionOp::print(OpAsmPrinter &p) {
   SmallVector<StringRef, 1> elidedAttrs;
   elidedAttrs.push_back("operand_segment_sizes");
-  p.printOptionalAttrDictWithKeyword((*this)->getAttrs(), elidedAttrs);
   if (!getWorkload().empty()) {
     p << "[" << getWorkload() << "]";
   }
@@ -398,7 +397,9 @@
     if (it.index() < getNumResults() - 1)
       p << ", ";
   }
-  p << ") ";
+  p << ")";
+  p.printOptionalAttrDictWithKeyword((*this)->getAttrs(), elidedAttrs);
+  p << " ";
 
   bool printTerminator = true;
   if (auto *term =
@@ -1106,6 +1107,7 @@
   auto newOp = rewriter.create<DispatchWorkgroupsOp>(
       getLoc(), getWorkload(), newResultTypes, newResultDims, newArguments,
       newArgumentDims, newTiedOperandIndices, getOperation()->getAttrs());
+  newOp->setDialectAttrs(getOperation()->getDialectAttrs());
   auto &newBody = newOp.getClosureBodyRegion();
   newBody.takeBody(getClosureBodyRegion());
 
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/OutlineDispatchRegions.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/OutlineDispatchRegions.cpp
index fd2e16d..ef6be67 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/OutlineDispatchRegions.cpp
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/OutlineDispatchRegions.cpp
@@ -386,13 +386,12 @@
   auto exportOp = builder.create<ExecutableExportOp>(
       regionOp.getLoc(), workgroupFuncOp.getName(),
       SymbolRefAttr::get(workgroupFuncOp));
-  if (!regionOp.getWorkgroupCount().empty())
-    exportOp.getWorkgroupCount().takeBody(regionOp.getWorkgroupCount());
 
   // Move over the workgroup count region, if present.
   if (!regionOp.getWorkgroupCount().empty()) {
     exportOp.getWorkgroupCount().takeBody(regionOp.getWorkgroupCount());
   }
+  exportOp->setDialectAttrs(regionOp->getDialectAttrs());
 
   // Finally convert the dispatch region into a dispatch to the outlined func.
   return convertToDispatchOp(regionOp, executableOp, exportOp);
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/convert_region_to_workgroups.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/convert_region_to_workgroups.mlir
index c2d0a6e..19139b2 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/convert_region_to_workgroups.mlir
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/convert_region_to_workgroups.mlir
@@ -31,7 +31,7 @@
   //      CHECK:   flow.dispatch.tensor.store %[[matmul]], %[[arg5]], offsets = [0, 0], sizes = [5, 11], strides = [1, 1] : tensor<5x11xf32> -> !flow.dispatch.tensor<writeonly:tensor<5x11xf32>>
   //      CHECK:   flow.return
   //      CHECK: }
-  %r1 = flow.dispatch.region {stream.affinity = #hal.affinity.queue<[0]>} -> (tensor<5x11xf32>) {
+  %r1 = flow.dispatch.region -> (tensor<5x11xf32>) attributes {stream.affinity = #hal.affinity.queue<[0]>} {
     %zero = arith.constant 0.0 : f32
     %0 = tensor.empty() : tensor<5x11xf32>
     %1 = linalg.fill ins(%zero : f32) outs(%0 : tensor<5x11xf32>) -> tensor<5x11xf32>
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMLinkerUtils.cpp b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMLinkerUtils.cpp
index 05868ee..57ba456 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMLinkerUtils.cpp
+++ b/compiler/src/iree/compiler/Dialect/HAL/Target/LLVMLinkerUtils.cpp
@@ -17,7 +17,7 @@
 namespace HAL {
 
 static llvm::cl::opt<std::string> clBitcodeFile(
-    "iree-llvmcpu-link-bitcode",
+    "iree-link-bitcode",
     llvm::cl::desc("Paths of additional bitcode file to load and link."),
     llvm::cl::init(""));
 
@@ -131,9 +131,15 @@
     return mlir::emitError(loc) << "failed reading user bitcode file `"
                                 << clBitcodeFile << "`: " << ec.message();
   }
+  auto setAlwaysInline = [&](llvm::Module &module) {
+    for (auto &func : module.getFunctionList()) {
+      func.addFnAttr(llvm::Attribute::AlwaysInline);
+    }
+  };
   if (failed(linkBitcodeModule(
           loc, linker, linkerFlags, targetMachine, clBitcodeFile,
-          llvm::parseBitcodeFile(*bitcodeBufferRef->get(), context)))) {
+          llvm::parseBitcodeFile(*bitcodeBufferRef->get(), context),
+          setAlwaysInline))) {
     return mlir::emitError(loc) << "failed linking in user bitcode file `"
                                 << clBitcodeFile << "` for target triple '"
                                 << targetMachine.getTargetTriple().str() << "'";
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Transforms/BUILD.bazel b/compiler/src/iree/compiler/Dialect/HAL/Transforms/BUILD.bazel
index ba624eb..b109c08 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/BUILD.bazel
+++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/BUILD.bazel
@@ -40,6 +40,7 @@
         "Passes.h",
     ],
     deps = [
+        "//compiler/src/iree/compiler/Codegen/Dialect:IREECodegenDialect",
         "//compiler/src/iree/compiler/Dialect/Flow/IR",
         "//compiler/src/iree/compiler/Dialect/HAL/Analysis",
         "//compiler/src/iree/compiler/Dialect/HAL/Conversion",
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Transforms/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/HAL/Transforms/CMakeLists.txt
index c9af4cc..bdf4010 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/CMakeLists.txt
@@ -51,6 +51,7 @@
     MLIRSupport
     MLIRTensorDialect
     MLIRTransforms
+    iree::compiler::Codegen::Dialect::IREECodegenDialect
     iree::compiler::Dialect::Flow::IR
     iree::compiler::Dialect::HAL::Analysis
     iree::compiler::Dialect::HAL::Conversion
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Transforms/MaterializeInterfaces.cpp b/compiler/src/iree/compiler/Dialect/HAL/Transforms/MaterializeInterfaces.cpp
index 9128967..96f0425 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/MaterializeInterfaces.cpp
+++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/MaterializeInterfaces.cpp
@@ -7,6 +7,7 @@
 #include <memory>
 #include <utility>
 
+#include "iree/compiler/Codegen/Dialect/LoweringConfig.h"
 #include "iree/compiler/Dialect/HAL/Analysis/BindingLayout.h"
 #include "iree/compiler/Dialect/HAL/IR/HALDialect.h"
 #include "iree/compiler/Dialect/HAL/IR/HALOps.h"
@@ -339,10 +340,26 @@
     for (auto variantOp : variantOps) {
       // Declare the entry point on the target.
       OpBuilder targetBuilder(variantOp.getInnerModule());
+      // Check if workgroup size is set externally.
+      ArrayAttr workgroupSize;
+      for (auto attr : exportOp->getAttrs()) {
+        if (attr.getValue().isa<IREE::Codegen::ExportConfigAttr>()) {
+          workgroupSize = attr.getValue()
+                              .cast<IREE::Codegen::ExportConfigAttr>()
+                              .getWorkgroupSize();
+          if (workgroupSize.size() < 3) {
+            SmallVector<Attribute> workgroupSizeVals =
+                llvm::to_vector(workgroupSize);
+            workgroupSizeVals.resize(3, targetBuilder.getIndexAttr(1));
+            workgroupSize = targetBuilder.getArrayAttr(workgroupSizeVals);
+          }
+          break;
+        }
+      }
       auto newExportOp = targetBuilder.create<IREE::HAL::ExecutableExportOp>(
           exportOp.getLoc(),
           targetBuilder.getStringAttr(exportOp.getFunctionRef()),
-          targetBuilder.getIndexAttr(ordinal), layoutAttr, ArrayAttr{},
+          targetBuilder.getIndexAttr(ordinal), layoutAttr, workgroupSize,
           /*subgroup_size=*/IntegerAttr{},
           /*workgroup_local_memory=*/IntegerAttr{});
 
diff --git a/samples/custom_dispatch/cuda/kernels/CMakeLists.txt b/samples/custom_dispatch/cuda/kernels/CMakeLists.txt
index b64999a..300e68e 100644
--- a/samples/custom_dispatch/cuda/kernels/CMakeLists.txt
+++ b/samples/custom_dispatch/cuda/kernels/CMakeLists.txt
@@ -127,3 +127,33 @@
     "driver=cuda"
     "hostonly"
 )
+
+iree_cuda_bitcode_library(
+  NAME
+    cuda_ukernel
+  CUDA_ARCH
+    sm_60
+  SRCS
+    "ukernel.cu" 
+)
+
+iree_check_single_backend_test_suite(
+  NAME
+    check_cuda_ukernel
+  SRCS
+    "ukernel_example.mlir"
+  TARGET_BACKEND
+    "cuda"
+  COMPILER_FLAGS
+    "--iree-link-bitcode=cuda_ukernel.bc"
+  DRIVER
+    "cuda"
+  LABELS
+    "noasan"
+    "nomsan"
+    "notsan"
+    "noubsan"
+    "requires-gpu-nvidia"
+  DEPENDS
+    ::cuda_ukernel
+)
diff --git a/samples/custom_dispatch/cuda/kernels/ukernel.cu b/samples/custom_dispatch/cuda/kernels/ukernel.cu
new file mode 100644
index 0000000..a106431
--- /dev/null
+++ b/samples/custom_dispatch/cuda/kernels/ukernel.cu
@@ -0,0 +1,11 @@
+extern "C" __device__ void simple_mul_workgroup(float *lhs, size_t lhs_offset,
+                                                float *rhs, size_t rhs_offset,
+                                                float *result,
+                                                size_t result_offset,
+                                                size_t size) {
+  int threadId = threadIdx.x;
+  if (threadId < size) {
+    result[result_offset + threadId] =
+        lhs[lhs_offset + threadId] * rhs[rhs_offset + threadId];
+  }
+}
diff --git a/samples/custom_dispatch/cuda/kernels/ukernel_example.mlir b/samples/custom_dispatch/cuda/kernels/ukernel_example.mlir
new file mode 100644
index 0000000..a66feb9
--- /dev/null
+++ b/samples/custom_dispatch/cuda/kernels/ukernel_example.mlir
@@ -0,0 +1,48 @@
+func.func @ukernel_example() {
+  %s0 = arith.constant dense<[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]> : tensor<10xf32>
+  %s1 = arith.constant dense<[0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0]> : tensor<10xf32>
+  %arg0 = util.optimization_barrier %s0 : tensor<10xf32>
+  %arg1 = util.optimization_barrier %s1 : tensor<10xf32>
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c3 = arith.constant 3 : index
+  %dest = tensor.empty() : tensor<10xf32>
+  // Create a dispatch that uses a workgroup size of 4. The
+  // `flow.dispatch.region` captures the values needed to specify the number
+  // of workgroups to use in the `count` region.
+  %0 = flow.dispatch.region[] -> (tensor<10xf32>)
+      // Use `#iree_codegen.export_config` to control how the dispatch executes;
+      // currently only the workgroup (block) size.
+      // Note: the attribute name "iree_codegen.export_config" matters too; it is
+      // what allows the attribute to be propagated through the compiler.
+      attributes {iree_codegen.export_config = #iree_codegen.export_config<workgroup_size = [4 : index]>} {
+    %id = flow.dispatch.workgroup.id[0] : index
+    %count = flow.dispatch.workgroup.count[0] : index
+
+    // Compute the offset and size of the slice
+    %offset = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%id]
+    %size = affine.min affine_map<(d0)[] -> (4, 10 - d0)>(%offset)[]
+
+    // Extract slices of the inputs and outputs.
+    %1 = tensor.extract_slice %arg0[%offset] [%size] [1] : tensor<10xf32> to tensor<?xf32>
+    %2 = tensor.extract_slice %arg1[%offset] [%size] [1] : tensor<10xf32> to tensor<?xf32>
+    %3 = tensor.extract_slice %dest[%offset] [%size] [1] : tensor<10xf32> to tensor<?xf32>
+
+    // Invoke the ukernel.
+    %4 = iree_codegen.ukernel.generic "simple_mul_workgroup"
+      ins(%1, %2 : tensor<?xf32>, tensor<?xf32>)
+      outs(%3 : tensor<?xf32>)
+      (%size : index)
+      // Set the operation to not incorporate any strides. The implementation
+      // expects no stride arguments.
+      strided_outer_dims(0) -> tensor<?xf32>
+
+    // Insert the result slice back into the destination at the right position.
+    %5 = tensor.insert_slice %4 into %dest[%offset] [%size] [1] : tensor<?xf32> into tensor<10xf32>
+    flow.return %5 : tensor<10xf32>
+  } count() -> (index, index, index) {
+    flow.return %c3, %c1, %c1 : index, index, index
+  }
+  check.expect_almost_eq_const(%0, dense<[0.0, 2.0, 8.0, 18.0, 32.0, 50.0, 72.0, 98.0, 128.0, 162.0]> : tensor<10xf32>) : tensor<10xf32>
+  return
+}