Integrate llvm-project at 4706251a3186 (#14505)

* Reset third_party/llvm-project: 4706251a3186c34da0ee8fd894f7e6b095da8fdc (2023-07-25 13:58:49 +0200): Clear release notes for 18.x

Co-authored-by: Stella Laurenzo <stellaraccident@gmail.com>
Co-authored-by: Jerry Wu <cheyuw@google.com>
diff --git a/build_tools/python/e2e_test_framework/models/model_groups.py b/build_tools/python/e2e_test_framework/models/model_groups.py
index 8b3fb0e..55a31f8 100644
--- a/build_tools/python/e2e_test_framework/models/model_groups.py
+++ b/build_tools/python/e2e_test_framework/models/model_groups.py
@@ -232,7 +232,8 @@
     torch_models.MODEL_CLIP_TEXT_SEQLEN64_FP32_TORCH,
     torch_models.MODEL_UNET_2D_FP32_TORCH,
     torch_models.EFFICIENTNET_B7_FP32_TORCH,
-    torch_models.BERT_LARGE_384_FP16_TORCH_BATCHES[1],
+    # TODO(#14515): Re-enable this BERT-Large FP16 model after regenerating models.
+    # torch_models.BERT_LARGE_384_FP16_TORCH_BATCHES[1],
     torch_models.EFFICIENTNET_V2_S_FP16_TORCH,
 ]
 
@@ -243,7 +244,8 @@
     + BERT_LARGE_TORCH_BATCHES
     + RESNET50_TORCH_BATCHES
     + RESNET50_FP16_TORCH_BATCHES
-    + BERT_LARGE_FP16_TORCH_BATCHES
+    # TODO(#14515): Re-enable the BERT-Large FP16 batches after regenerating models.
+    # + BERT_LARGE_FP16_TORCH_BATCHES
     + BERT_LARGE_JAX_BATCHES
     + RESNET50_JAX_BATCHES
     + T5_LARGE_JAX_BATCHES
diff --git a/compiler/src/iree/compiler/API/api_exports.c b/compiler/src/iree/compiler/API/api_exports.c
index dc9ecb2..5a3cae7 100644
--- a/compiler/src/iree/compiler/API/api_exports.c
+++ b/compiler/src/iree/compiler/API/api_exports.c
@@ -178,6 +178,7 @@
 extern void mlirBlockGetParentOperation();
 extern void mlirBlockGetParentRegion();
 extern void mlirBlockGetTerminator();
+extern void mlirBlockInsertArgument();
 extern void mlirBlockInsertOwnedOperation();
 extern void mlirBlockInsertOwnedOperationAfter();
 extern void mlirBlockInsertOwnedOperationBefore();
@@ -193,6 +194,8 @@
 extern void mlirContextAppendDialectRegistry();
 extern void mlirContextAttachDiagnosticHandler();
 extern void mlirContextCreate();
+extern void mlirContextCreateWithRegistry();
+extern void mlirContextCreateWithThreading();
 extern void mlirContextDestroy();
 extern void mlirContextDetachDiagnosticHandler();
 extern void mlirContextEnableMultithreading();
@@ -204,6 +207,7 @@
 extern void mlirContextIsRegisteredOperation();
 extern void mlirContextLoadAllAvailableDialects();
 extern void mlirContextSetAllowUnregisteredDialects();
+extern void mlirContextSetThreadPool();
 extern void mlirCreateExternalPass();
 extern void mlirDenseArrayAttrGetTypeID();
 extern void mlirDenseArrayGetNumElements();
@@ -387,6 +391,8 @@
 extern void mlirIntegerTypeUnsignedGet();
 extern void mlirIsGlobalDebugEnabled();
 extern void mlirLinalgFillBuiltinNamedOpRegion();
+extern void mlirLlvmThreadPoolCreate();
+extern void mlirLlvmThreadPoolDestroy();
 extern void mlirLocationCallSiteGet();
 extern void mlirLocationEqual();
 extern void mlirLocationFileLineColGet();
@@ -474,6 +480,7 @@
 extern void mlirOperationRemoveFromParent();
 extern void mlirOperationSetAttributeByName();
 extern void mlirOperationSetOperand();
+extern void mlirOperationSetOperands();
 extern void mlirOperationStateAddAttributes();
 extern void mlirOperationStateAddOperands();
 extern void mlirOperationStateAddOwnedRegions();
@@ -514,6 +521,7 @@
 extern void mlirRegionInsertOwnedBlock();
 extern void mlirRegionInsertOwnedBlockAfter();
 extern void mlirRegionInsertOwnedBlockBefore();
+extern void mlirRegionTakeBody();
 extern void mlirRegisterLinalgPasses();
 extern void mlirShapedTypeGetDimSize();
 extern void mlirShapedTypeGetDynamicSize();
@@ -817,6 +825,7 @@
   x += (uintptr_t)&mlirBlockGetParentOperation;
   x += (uintptr_t)&mlirBlockGetParentRegion;
   x += (uintptr_t)&mlirBlockGetTerminator;
+  x += (uintptr_t)&mlirBlockInsertArgument;
   x += (uintptr_t)&mlirBlockInsertOwnedOperation;
   x += (uintptr_t)&mlirBlockInsertOwnedOperationAfter;
   x += (uintptr_t)&mlirBlockInsertOwnedOperationBefore;
@@ -832,6 +841,8 @@
   x += (uintptr_t)&mlirContextAppendDialectRegistry;
   x += (uintptr_t)&mlirContextAttachDiagnosticHandler;
   x += (uintptr_t)&mlirContextCreate;
+  x += (uintptr_t)&mlirContextCreateWithRegistry;
+  x += (uintptr_t)&mlirContextCreateWithThreading;
   x += (uintptr_t)&mlirContextDestroy;
   x += (uintptr_t)&mlirContextDetachDiagnosticHandler;
   x += (uintptr_t)&mlirContextEnableMultithreading;
@@ -843,6 +854,7 @@
   x += (uintptr_t)&mlirContextIsRegisteredOperation;
   x += (uintptr_t)&mlirContextLoadAllAvailableDialects;
   x += (uintptr_t)&mlirContextSetAllowUnregisteredDialects;
+  x += (uintptr_t)&mlirContextSetThreadPool;
   x += (uintptr_t)&mlirCreateExternalPass;
   x += (uintptr_t)&mlirDenseArrayAttrGetTypeID;
   x += (uintptr_t)&mlirDenseArrayGetNumElements;
@@ -1026,6 +1038,8 @@
   x += (uintptr_t)&mlirIntegerTypeUnsignedGet;
   x += (uintptr_t)&mlirIsGlobalDebugEnabled;
   x += (uintptr_t)&mlirLinalgFillBuiltinNamedOpRegion;
+  x += (uintptr_t)&mlirLlvmThreadPoolCreate;
+  x += (uintptr_t)&mlirLlvmThreadPoolDestroy;
   x += (uintptr_t)&mlirLocationCallSiteGet;
   x += (uintptr_t)&mlirLocationEqual;
   x += (uintptr_t)&mlirLocationFileLineColGet;
@@ -1113,6 +1127,7 @@
   x += (uintptr_t)&mlirOperationRemoveFromParent;
   x += (uintptr_t)&mlirOperationSetAttributeByName;
   x += (uintptr_t)&mlirOperationSetOperand;
+  x += (uintptr_t)&mlirOperationSetOperands;
   x += (uintptr_t)&mlirOperationStateAddAttributes;
   x += (uintptr_t)&mlirOperationStateAddOperands;
   x += (uintptr_t)&mlirOperationStateAddOwnedRegions;
@@ -1153,6 +1168,7 @@
   x += (uintptr_t)&mlirRegionInsertOwnedBlock;
   x += (uintptr_t)&mlirRegionInsertOwnedBlockAfter;
   x += (uintptr_t)&mlirRegionInsertOwnedBlockBefore;
+  x += (uintptr_t)&mlirRegionTakeBody;
   x += (uintptr_t)&mlirRegisterLinalgPasses;
   x += (uintptr_t)&mlirShapedTypeGetDimSize;
   x += (uintptr_t)&mlirShapedTypeGetDynamicSize;
diff --git a/compiler/src/iree/compiler/API/api_exports.def b/compiler/src/iree/compiler/API/api_exports.def
index a068fd8..986cb46 100644
--- a/compiler/src/iree/compiler/API/api_exports.def
+++ b/compiler/src/iree/compiler/API/api_exports.def
@@ -170,6 +170,7 @@
   mlirBlockGetParentOperation
   mlirBlockGetParentRegion
   mlirBlockGetTerminator
+  mlirBlockInsertArgument
   mlirBlockInsertOwnedOperation
   mlirBlockInsertOwnedOperationAfter
   mlirBlockInsertOwnedOperationBefore
@@ -185,6 +186,8 @@
   mlirContextAppendDialectRegistry
   mlirContextAttachDiagnosticHandler
   mlirContextCreate
+  mlirContextCreateWithRegistry
+  mlirContextCreateWithThreading
   mlirContextDestroy
   mlirContextDetachDiagnosticHandler
   mlirContextEnableMultithreading
@@ -196,6 +199,7 @@
   mlirContextIsRegisteredOperation
   mlirContextLoadAllAvailableDialects
   mlirContextSetAllowUnregisteredDialects
+  mlirContextSetThreadPool
   mlirCreateExternalPass
   mlirDenseArrayAttrGetTypeID
   mlirDenseArrayGetNumElements
@@ -379,6 +383,8 @@
   mlirIntegerTypeUnsignedGet
   mlirIsGlobalDebugEnabled
   mlirLinalgFillBuiltinNamedOpRegion
+  mlirLlvmThreadPoolCreate
+  mlirLlvmThreadPoolDestroy
   mlirLocationCallSiteGet
   mlirLocationEqual
   mlirLocationFileLineColGet
@@ -466,6 +472,7 @@
   mlirOperationRemoveFromParent
   mlirOperationSetAttributeByName
   mlirOperationSetOperand
+  mlirOperationSetOperands
   mlirOperationStateAddAttributes
   mlirOperationStateAddOperands
   mlirOperationStateAddOwnedRegions
@@ -506,6 +513,7 @@
   mlirRegionInsertOwnedBlock
   mlirRegionInsertOwnedBlockAfter
   mlirRegionInsertOwnedBlockBefore
+  mlirRegionTakeBody
   mlirRegisterLinalgPasses
   mlirShapedTypeGetDimSize
   mlirShapedTypeGetDynamicSize
diff --git a/compiler/src/iree/compiler/API/api_exports.ld b/compiler/src/iree/compiler/API/api_exports.ld
index cc7e17a..a252dcb 100644
--- a/compiler/src/iree/compiler/API/api_exports.ld
+++ b/compiler/src/iree/compiler/API/api_exports.ld
@@ -171,6 +171,7 @@
     mlirBlockGetParentOperation;
     mlirBlockGetParentRegion;
     mlirBlockGetTerminator;
+    mlirBlockInsertArgument;
     mlirBlockInsertOwnedOperation;
     mlirBlockInsertOwnedOperationAfter;
     mlirBlockInsertOwnedOperationBefore;
@@ -186,6 +187,8 @@
     mlirContextAppendDialectRegistry;
     mlirContextAttachDiagnosticHandler;
     mlirContextCreate;
+    mlirContextCreateWithRegistry;
+    mlirContextCreateWithThreading;
     mlirContextDestroy;
     mlirContextDetachDiagnosticHandler;
     mlirContextEnableMultithreading;
@@ -197,6 +200,7 @@
     mlirContextIsRegisteredOperation;
     mlirContextLoadAllAvailableDialects;
     mlirContextSetAllowUnregisteredDialects;
+    mlirContextSetThreadPool;
     mlirCreateExternalPass;
     mlirDenseArrayAttrGetTypeID;
     mlirDenseArrayGetNumElements;
@@ -380,6 +384,8 @@
     mlirIntegerTypeUnsignedGet;
     mlirIsGlobalDebugEnabled;
     mlirLinalgFillBuiltinNamedOpRegion;
+    mlirLlvmThreadPoolCreate;
+    mlirLlvmThreadPoolDestroy;
     mlirLocationCallSiteGet;
     mlirLocationEqual;
     mlirLocationFileLineColGet;
@@ -467,6 +473,7 @@
     mlirOperationRemoveFromParent;
     mlirOperationSetAttributeByName;
     mlirOperationSetOperand;
+    mlirOperationSetOperands;
     mlirOperationStateAddAttributes;
     mlirOperationStateAddOperands;
     mlirOperationStateAddOwnedRegions;
@@ -507,6 +514,7 @@
     mlirRegionInsertOwnedBlock;
     mlirRegionInsertOwnedBlockAfter;
     mlirRegionInsertOwnedBlockBefore;
+    mlirRegionTakeBody;
     mlirRegisterLinalgPasses;
     mlirShapedTypeGetDimSize;
     mlirShapedTypeGetDynamicSize;
diff --git a/compiler/src/iree/compiler/API/api_exports.macos.lst b/compiler/src/iree/compiler/API/api_exports.macos.lst
index 1bc5e1c..07fb753 100644
--- a/compiler/src/iree/compiler/API/api_exports.macos.lst
+++ b/compiler/src/iree/compiler/API/api_exports.macos.lst
@@ -169,6 +169,7 @@
 _mlirBlockGetParentOperation
 _mlirBlockGetParentRegion
 _mlirBlockGetTerminator
+_mlirBlockInsertArgument
 _mlirBlockInsertOwnedOperation
 _mlirBlockInsertOwnedOperationAfter
 _mlirBlockInsertOwnedOperationBefore
@@ -184,6 +185,8 @@
 _mlirContextAppendDialectRegistry
 _mlirContextAttachDiagnosticHandler
 _mlirContextCreate
+_mlirContextCreateWithRegistry
+_mlirContextCreateWithThreading
 _mlirContextDestroy
 _mlirContextDetachDiagnosticHandler
 _mlirContextEnableMultithreading
@@ -195,6 +198,7 @@
 _mlirContextIsRegisteredOperation
 _mlirContextLoadAllAvailableDialects
 _mlirContextSetAllowUnregisteredDialects
+_mlirContextSetThreadPool
 _mlirCreateExternalPass
 _mlirDenseArrayAttrGetTypeID
 _mlirDenseArrayGetNumElements
@@ -378,6 +382,8 @@
 _mlirIntegerTypeUnsignedGet
 _mlirIsGlobalDebugEnabled
 _mlirLinalgFillBuiltinNamedOpRegion
+_mlirLlvmThreadPoolCreate
+_mlirLlvmThreadPoolDestroy
 _mlirLocationCallSiteGet
 _mlirLocationEqual
 _mlirLocationFileLineColGet
@@ -465,6 +471,7 @@
 _mlirOperationRemoveFromParent
 _mlirOperationSetAttributeByName
 _mlirOperationSetOperand
+_mlirOperationSetOperands
 _mlirOperationStateAddAttributes
 _mlirOperationStateAddOperands
 _mlirOperationStateAddOwnedRegions
@@ -505,6 +512,7 @@
 _mlirRegionInsertOwnedBlock
 _mlirRegionInsertOwnedBlockAfter
 _mlirRegionInsertOwnedBlockBefore
+_mlirRegionTakeBody
 _mlirRegisterLinalgPasses
 _mlirShapedTypeGetDimSize
 _mlirShapedTypeGetDynamicSize
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/fuse_dispatch_bindings.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/fuse_dispatch_bindings.mlir
index 198ad3f..6f0f7a6 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/fuse_dispatch_bindings.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/fuse_dispatch_bindings.mlir
@@ -27,7 +27,7 @@
       // CHECK-NEXT: util.optimization_barrier %[[SUBSPAN_A]]
       util.optimization_barrier %subspan_a : !flow.dispatch.tensor<readwrite:tensor<20xi8>>
 
-      // CHECK: %[[SUM_OFFSET_B:.+]] = arith.addi %c20, %[[OFFSET_B]]
+      // CHECK: %[[SUM_OFFSET_B:.+]] = arith.addi %[[OFFSET_B]], %c20
       // CHECK-NEXT: %[[SUBSPAN_B:.+]] = stream.binding.subspan %[[BINDING_B]][%[[SUM_OFFSET_B]]]
       %subspan_b = stream.binding.subspan %binding_b[%c20] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<20xi8>>{%c20}
       // CHECK-NEXT: util.optimization_barrier %[[SUBSPAN_B]]
@@ -109,13 +109,13 @@
       // CHECK-NEXT: util.optimization_barrier %[[SUBSPAN_A]]
       util.optimization_barrier %subspan_a : !flow.dispatch.tensor<readwrite:tensor<20xi8>>
 
-      // CHECK: %[[SUM_OFFSET_B:.+]] = arith.addi %c20, %[[OFFSET_B]]
+      // CHECK: %[[SUM_OFFSET_B:.+]] = arith.addi %[[OFFSET_B]], %c20
       // CHECK-NEXT: %[[SUBSPAN_B:.+]] = stream.binding.subspan %[[BINDING_B]][%[[SUM_OFFSET_B]]]
       %subspan_b = stream.binding.subspan %binding_b[%c20] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<20xi8>>{%c20}
       // CHECK-NEXT: util.optimization_barrier %[[SUBSPAN_B]]
       util.optimization_barrier %subspan_b : !flow.dispatch.tensor<readwrite:tensor<20xi8>>
 
-      // CHECK: %[[SUM_OFFSET_C:.+]] = arith.addi %c40, %[[OFFSET_C]]
+      // CHECK: %[[SUM_OFFSET_C:.+]] = arith.addi %[[OFFSET_C]], %c40
       // CHECK-NEXT: %[[SUBSPAN_C:.+]] = stream.binding.subspan %[[BINDING_A]][%[[SUM_OFFSET_C]]]
       %subspan_c = stream.binding.subspan %binding_c[%c40] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<20xi8>>{%c20}
       // CHECK-NEXT: util.optimization_barrier %[[SUBSPAN_C]]
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/fuse_dispatch_bindings_noalias.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/fuse_dispatch_bindings_noalias.mlir
index 686819f..1736232 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/fuse_dispatch_bindings_noalias.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/fuse_dispatch_bindings_noalias.mlir
@@ -25,13 +25,13 @@
       // CHECK-NEXT: util.optimization_barrier %[[SUBSPAN_A]]
       util.optimization_barrier %subspan_a : !flow.dispatch.tensor<readwrite:tensor<20xi8>>
 
-      // CHECK: %[[SUM_OFFSET_B:.+]] = arith.addi %c20, %[[OFFSET_B]]
+      // CHECK: %[[SUM_OFFSET_B:.+]] = arith.addi %[[OFFSET_B]], %c20
       // CHECK-NEXT: %[[SUBSPAN_B:.+]] = stream.binding.subspan %[[BINDING_A]][%[[SUM_OFFSET_B]]]
       %subspan_b = stream.binding.subspan %binding_b[%c20] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<20xi8>>{%c20}
       // CHECK-NEXT: util.optimization_barrier %[[SUBSPAN_B]]
       util.optimization_barrier %subspan_b : !flow.dispatch.tensor<readwrite:tensor<20xi8>>
 
-      // CHECK: %[[SUM_OFFSET_C:.+]] = arith.addi %c40, %[[OFFSET_C]]
+      // CHECK: %[[SUM_OFFSET_C:.+]] = arith.addi %[[OFFSET_C]], %c40
       // CHECK-NEXT: %[[SUBSPAN_C:.+]] = stream.binding.subspan %[[BINDING_C]][%[[SUM_OFFSET_C]]]
       %subspan_c = stream.binding.subspan %binding_c[%c40] : !stream.binding -> !flow.dispatch.tensor<readwrite:tensor<20xi8>>{%c20}
       // CHECK-NEXT: util.optimization_barrier %[[SUBSPAN_C]]
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/layout_slices.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/layout_slices.mlir
index 92737b4..30cf36d 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/layout_slices.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/layout_slices.mlir
@@ -50,7 +50,7 @@
   // CHECK-DAG: %c0 = arith.constant 0 : index
   // CHECK-DAG: %c16 = arith.constant 16 : index
   // CHECK-DAG: %0 = util.align %[[SIZE_A]], %c16 : index
-  // CHECK-DAG: %1 = arith.addi %c0, %0 : index
+  // CHECK-DAG: %1 = arith.addi %0, %c0 : index
   // CHECK-DAG: %2 = util.align %[[SIZE_B]], %c16 : index
   // CHECK-DAG: %3 = arith.addi %1, %2 : index
 
@@ -85,7 +85,7 @@
   // CHECK-DAG: %c16 = arith.constant 16 : index
   // CHECK-DAG: %c208 = arith.constant 208 : index
   // CHECK-DAG: %0 = util.align %[[SIZE_A]], %c16 : index
-  // CHECK-DAG: %1 = arith.addi %c208, %0 : index
+  // CHECK-DAG: %1 = arith.addi %0, %c208 : index
   // CHECK-DAG: %2 = util.align %[[SIZE_B]], %c16 : index
   // CHECK-DAG: %3 = arith.addi %1, %2 : index
 
diff --git a/llvm-external-projects/iree-dialects/BUILD.bazel b/llvm-external-projects/iree-dialects/BUILD.bazel
index b748c5a..77c5e34 100644
--- a/llvm-external-projects/iree-dialects/BUILD.bazel
+++ b/llvm-external-projects/iree-dialects/BUILD.bazel
@@ -43,9 +43,7 @@
         "include/iree-dialects/Dialect/LinalgExt/Passes/*.td",
         "include/iree-dialects/Dialect/LinalgTransform/*.td",
         "python/iree/compiler/dialects/*.td",
-    ]) + [
-        "@llvm-project//mlir:include/mlir/Bindings/Python/Attributes.td",
-    ],
+    ]),
     includes = ["include"],
     deps = [
         "@llvm-project//mlir:BuiltinDialectTdFiles",
diff --git a/llvm-external-projects/iree-dialects/python/iree/compiler/dialects/IreeInputBinding.td b/llvm-external-projects/iree-dialects/python/iree/compiler/dialects/IreeInputBinding.td
index eabf3c4..0b607b0 100644
--- a/llvm-external-projects/iree-dialects/python/iree/compiler/dialects/IreeInputBinding.td
+++ b/llvm-external-projects/iree-dialects/python/iree/compiler/dialects/IreeInputBinding.td
@@ -7,7 +7,6 @@
 #ifndef PYTHON_BINDINGS_IREE_OPS
 #define PYTHON_BINDINGS_IREE_OPS
 
-include "mlir/Bindings/Python/Attributes.td"
 include "iree-dialects/Dialect/Input/InputOps.td"
 
 #endif // PYTHON_BINDINGS_IREE_OPS
diff --git a/llvm-external-projects/iree-dialects/python/iree/compiler/dialects/IreeLinalgExtBinding.td b/llvm-external-projects/iree-dialects/python/iree/compiler/dialects/IreeLinalgExtBinding.td
index da2ceae..7502545 100644
--- a/llvm-external-projects/iree-dialects/python/iree/compiler/dialects/IreeLinalgExtBinding.td
+++ b/llvm-external-projects/iree-dialects/python/iree/compiler/dialects/IreeLinalgExtBinding.td
@@ -7,7 +7,6 @@
 #ifndef PYTHON_BINDINGS_IREE_LINALGEXT_OPS
 #define PYTHON_BINDINGS_IREE_LINALGEXT_OPS
 
-include "mlir/Bindings/Python/Attributes.td"
 include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.td"
 
 #endif // PYTHON_BINDINGS_IREE_LINALGEXT_OPS
diff --git a/llvm-external-projects/iree-dialects/python/iree/compiler/dialects/IreeStructuredTransformOps.td b/llvm-external-projects/iree-dialects/python/iree/compiler/dialects/IreeStructuredTransformOps.td
index 42c69e2..0a10cd3 100644
--- a/llvm-external-projects/iree-dialects/python/iree/compiler/dialects/IreeStructuredTransformOps.td
+++ b/llvm-external-projects/iree-dialects/python/iree/compiler/dialects/IreeStructuredTransformOps.td
@@ -7,7 +7,6 @@
 #ifndef PYTHON_BINDINGS_IREE_TRANSFORMEXT_BINDING
 #define PYTHON_BINDINGS_IREE_TRANSFORMEXT_BINDING
 
-include "mlir/Bindings/Python/Attributes.td"
 include "iree-dialects/Dialect/LinalgTransform/StructuredTransformOpsExt.td"
 
 #endif // PYTHON_BINDINGS_IREE_TRANSFORMEXT_BINDING
diff --git a/llvm-external-projects/iree-dialects/python/iree/compiler/dialects/LinalgTransformBinding.td b/llvm-external-projects/iree-dialects/python/iree/compiler/dialects/LinalgTransformBinding.td
index e908e79..f606e8c 100644
--- a/llvm-external-projects/iree-dialects/python/iree/compiler/dialects/LinalgTransformBinding.td
+++ b/llvm-external-projects/iree-dialects/python/iree/compiler/dialects/LinalgTransformBinding.td
@@ -7,7 +7,6 @@
 #ifndef PYTHON_BINDINGS_IREE_LINALGTRANSFORM_BINDING
 #define PYTHON_BINDINGS_IREE_LINALGTRANSFORM_BINDING
 
-include "mlir/Bindings/Python/Attributes.td"
 include "iree-dialects/Dialect/LinalgTransform/LinalgTransformOps.td"
 
 #endif // PYTHON_BINDINGS_IREE_LINALGTRANSFORM_BINDING
diff --git a/tests/e2e/test_artifacts/generated_e2e_test_fetch_models.cmake b/tests/e2e/test_artifacts/generated_e2e_test_fetch_models.cmake
index 5c53762..21b540a 100644
--- a/tests/e2e/test_artifacts/generated_e2e_test_fetch_models.cmake
+++ b/tests/e2e/test_artifacts/generated_e2e_test_fetch_models.cmake
@@ -202,13 +202,6 @@
 )
 
 iree_fetch_artifact(
-  NAME "model-BertLargefp16PTBatch1"
-  SOURCE_URL "https://storage.googleapis.com/iree-model-artifacts/pytorch/pt_models_20230709.894_1688992116/BERT_LARGE_FP16_PT_384XI32_BATCH1/linalg.mlirbc"
-  OUTPUT "${ROOT_ARTIFACTS_DIR}/model_BertLargefp16PTBatch1.mlirbc"
-  UNPACK
-)
-
-iree_fetch_artifact(
   NAME "model-EfficientNetV2Sfp16PT"
   SOURCE_URL "https://storage.googleapis.com/iree-model-artifacts/pytorch/torch_models_20230522.846_1684831160/EFFICIENTNET_V2_S_FP16/batch_1/linalg.mlir"
   OUTPUT "${ROOT_ARTIFACTS_DIR}/model_EfficientNetV2Sfp16PT.mlir"
@@ -517,62 +510,6 @@
 )
 
 iree_fetch_artifact(
-  NAME "model-BertLargefp16PTBatch16"
-  SOURCE_URL "https://storage.googleapis.com/iree-model-artifacts/pytorch/pt_models_20230709.894_1688992116/BERT_LARGE_FP16_PT_384XI32_BATCH16/linalg.mlirbc"
-  OUTPUT "${ROOT_ARTIFACTS_DIR}/model_BertLargefp16PTBatch16.mlirbc"
-  UNPACK
-)
-
-iree_fetch_artifact(
-  NAME "model-BertLargefp16PTBatch24"
-  SOURCE_URL "https://storage.googleapis.com/iree-model-artifacts/pytorch/pt_models_20230709.894_1688992116/BERT_LARGE_FP16_PT_384XI32_BATCH24/linalg.mlirbc"
-  OUTPUT "${ROOT_ARTIFACTS_DIR}/model_BertLargefp16PTBatch24.mlirbc"
-  UNPACK
-)
-
-iree_fetch_artifact(
-  NAME "model-BertLargefp16PTBatch32"
-  SOURCE_URL "https://storage.googleapis.com/iree-model-artifacts/pytorch/pt_models_20230709.894_1688992116/BERT_LARGE_FP16_PT_384XI32_BATCH32/linalg.mlirbc"
-  OUTPUT "${ROOT_ARTIFACTS_DIR}/model_BertLargefp16PTBatch32.mlirbc"
-  UNPACK
-)
-
-iree_fetch_artifact(
-  NAME "model-BertLargefp16PTBatch48"
-  SOURCE_URL "https://storage.googleapis.com/iree-model-artifacts/pytorch/pt_models_20230709.894_1688992116/BERT_LARGE_FP16_PT_384XI32_BATCH48/linalg.mlirbc"
-  OUTPUT "${ROOT_ARTIFACTS_DIR}/model_BertLargefp16PTBatch48.mlirbc"
-  UNPACK
-)
-
-iree_fetch_artifact(
-  NAME "model-BertLargefp16PTBatch64"
-  SOURCE_URL "https://storage.googleapis.com/iree-model-artifacts/pytorch/pt_models_20230709.894_1688992116/BERT_LARGE_FP16_PT_384XI32_BATCH64/linalg.mlirbc"
-  OUTPUT "${ROOT_ARTIFACTS_DIR}/model_BertLargefp16PTBatch64.mlirbc"
-  UNPACK
-)
-
-iree_fetch_artifact(
-  NAME "model-BertLargefp16PTBatch512"
-  SOURCE_URL "https://storage.googleapis.com/iree-model-artifacts/pytorch/pt_models_20230709.894_1688992116/BERT_LARGE_FP16_PT_384XI32_BATCH512/linalg.mlirbc"
-  OUTPUT "${ROOT_ARTIFACTS_DIR}/model_BertLargefp16PTBatch512.mlirbc"
-  UNPACK
-)
-
-iree_fetch_artifact(
-  NAME "model-BertLargefp16PTBatch1024"
-  SOURCE_URL "https://storage.googleapis.com/iree-model-artifacts/pytorch/pt_models_20230709.894_1688992116/BERT_LARGE_FP16_PT_384XI32_BATCH1024/linalg.mlirbc"
-  OUTPUT "${ROOT_ARTIFACTS_DIR}/model_BertLargefp16PTBatch1024.mlirbc"
-  UNPACK
-)
-
-iree_fetch_artifact(
-  NAME "model-BertLargefp16PTBatch1280"
-  SOURCE_URL "https://storage.googleapis.com/iree-model-artifacts/pytorch/pt_models_20230709.894_1688992116/BERT_LARGE_FP16_PT_384XI32_BATCH1280/linalg.mlirbc"
-  OUTPUT "${ROOT_ARTIFACTS_DIR}/model_BertLargefp16PTBatch1280.mlirbc"
-  UNPACK
-)
-
-iree_fetch_artifact(
   NAME "model-BERT_LARGE_JAX_384XI32_BATCH1"
   SOURCE_URL "https://storage.googleapis.com/iree-model-artifacts/jax/jax_models_0.4.13_1688607404/BERT_LARGE_FP32_JAX_384XI32_BATCH1/stablehlo.mlirbc"
   OUTPUT "${ROOT_ARTIFACTS_DIR}/model_BERT_LARGE_JAX_384XI32_BATCH1.mlirbc"
diff --git a/tests/e2e/test_artifacts/generated_e2e_test_iree_artifacts.cmake b/tests/e2e/test_artifacts/generated_e2e_test_iree_artifacts.cmake
index ca68b79..22f1aee 100644
--- a/tests/e2e/test_artifacts/generated_e2e_test_iree_artifacts.cmake
+++ b/tests/e2e/test_artifacts/generated_e2e_test_iree_artifacts.cmake
@@ -731,18 +731,6 @@
 )
 
 iree_bytecode_module(
-  NAME "iree-module-BertLargefp16PTBatch1_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_"
-  SRC "${ROOT_ARTIFACTS_DIR}/model_BertLargefp16PTBatch1.mlirbc"
-  MODULE_FILE_NAME "${ROOT_ARTIFACTS_DIR}/iree_module_BertLargefp16PTBatch1_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_/module.vmfb"
-  FLAGS
-    "--iree-hal-target-backends=cuda"
-    "--iree-input-type=none"
-    "--iree-hal-cuda-llvm-target-arch=sm_80"
-  FRIENDLY_NAME "BertLargefp16PTBatch1(linalg) [cuda-sm_80-linux_gnu-cuda][default-flags]"
-  PUBLIC
-)
-
-iree_bytecode_module(
   NAME "iree-module-EfficientNetV2Sfp16PT_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_"
   SRC "${ROOT_ARTIFACTS_DIR}/model_EfficientNetV2Sfp16PT.mlir"
   MODULE_FILE_NAME "${ROOT_ARTIFACTS_DIR}/iree_module_EfficientNetV2Sfp16PT_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_/module.vmfb"
@@ -1393,102 +1381,6 @@
 )
 
 iree_bytecode_module(
-  NAME "iree-module-BertLargefp16PTBatch16_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_"
-  SRC "${ROOT_ARTIFACTS_DIR}/model_BertLargefp16PTBatch16.mlirbc"
-  MODULE_FILE_NAME "${ROOT_ARTIFACTS_DIR}/iree_module_BertLargefp16PTBatch16_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_/module.vmfb"
-  FLAGS
-    "--iree-hal-target-backends=cuda"
-    "--iree-input-type=none"
-    "--iree-hal-cuda-llvm-target-arch=sm_80"
-  FRIENDLY_NAME "BertLargefp16PTBatch16(linalg) [cuda-sm_80-linux_gnu-cuda][default-flags]"
-  PUBLIC
-)
-
-iree_bytecode_module(
-  NAME "iree-module-BertLargefp16PTBatch24_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_"
-  SRC "${ROOT_ARTIFACTS_DIR}/model_BertLargefp16PTBatch24.mlirbc"
-  MODULE_FILE_NAME "${ROOT_ARTIFACTS_DIR}/iree_module_BertLargefp16PTBatch24_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_/module.vmfb"
-  FLAGS
-    "--iree-hal-target-backends=cuda"
-    "--iree-input-type=none"
-    "--iree-hal-cuda-llvm-target-arch=sm_80"
-  FRIENDLY_NAME "BertLargefp16PTBatch24(linalg) [cuda-sm_80-linux_gnu-cuda][default-flags]"
-  PUBLIC
-)
-
-iree_bytecode_module(
-  NAME "iree-module-BertLargefp16PTBatch32_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_"
-  SRC "${ROOT_ARTIFACTS_DIR}/model_BertLargefp16PTBatch32.mlirbc"
-  MODULE_FILE_NAME "${ROOT_ARTIFACTS_DIR}/iree_module_BertLargefp16PTBatch32_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_/module.vmfb"
-  FLAGS
-    "--iree-hal-target-backends=cuda"
-    "--iree-input-type=none"
-    "--iree-hal-cuda-llvm-target-arch=sm_80"
-  FRIENDLY_NAME "BertLargefp16PTBatch32(linalg) [cuda-sm_80-linux_gnu-cuda][default-flags]"
-  PUBLIC
-)
-
-iree_bytecode_module(
-  NAME "iree-module-BertLargefp16PTBatch48_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_"
-  SRC "${ROOT_ARTIFACTS_DIR}/model_BertLargefp16PTBatch48.mlirbc"
-  MODULE_FILE_NAME "${ROOT_ARTIFACTS_DIR}/iree_module_BertLargefp16PTBatch48_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_/module.vmfb"
-  FLAGS
-    "--iree-hal-target-backends=cuda"
-    "--iree-input-type=none"
-    "--iree-hal-cuda-llvm-target-arch=sm_80"
-  FRIENDLY_NAME "BertLargefp16PTBatch48(linalg) [cuda-sm_80-linux_gnu-cuda][default-flags]"
-  PUBLIC
-)
-
-iree_bytecode_module(
-  NAME "iree-module-BertLargefp16PTBatch64_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_"
-  SRC "${ROOT_ARTIFACTS_DIR}/model_BertLargefp16PTBatch64.mlirbc"
-  MODULE_FILE_NAME "${ROOT_ARTIFACTS_DIR}/iree_module_BertLargefp16PTBatch64_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_/module.vmfb"
-  FLAGS
-    "--iree-hal-target-backends=cuda"
-    "--iree-input-type=none"
-    "--iree-hal-cuda-llvm-target-arch=sm_80"
-  FRIENDLY_NAME "BertLargefp16PTBatch64(linalg) [cuda-sm_80-linux_gnu-cuda][default-flags]"
-  PUBLIC
-)
-
-iree_bytecode_module(
-  NAME "iree-module-BertLargefp16PTBatch512_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_"
-  SRC "${ROOT_ARTIFACTS_DIR}/model_BertLargefp16PTBatch512.mlirbc"
-  MODULE_FILE_NAME "${ROOT_ARTIFACTS_DIR}/iree_module_BertLargefp16PTBatch512_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_/module.vmfb"
-  FLAGS
-    "--iree-hal-target-backends=cuda"
-    "--iree-input-type=none"
-    "--iree-hal-cuda-llvm-target-arch=sm_80"
-  FRIENDLY_NAME "BertLargefp16PTBatch512(linalg) [cuda-sm_80-linux_gnu-cuda][default-flags]"
-  PUBLIC
-)
-
-iree_bytecode_module(
-  NAME "iree-module-BertLargefp16PTBatch1024_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_"
-  SRC "${ROOT_ARTIFACTS_DIR}/model_BertLargefp16PTBatch1024.mlirbc"
-  MODULE_FILE_NAME "${ROOT_ARTIFACTS_DIR}/iree_module_BertLargefp16PTBatch1024_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_/module.vmfb"
-  FLAGS
-    "--iree-hal-target-backends=cuda"
-    "--iree-input-type=none"
-    "--iree-hal-cuda-llvm-target-arch=sm_80"
-  FRIENDLY_NAME "BertLargefp16PTBatch1024(linalg) [cuda-sm_80-linux_gnu-cuda][default-flags]"
-  PUBLIC
-)
-
-iree_bytecode_module(
-  NAME "iree-module-BertLargefp16PTBatch1280_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_"
-  SRC "${ROOT_ARTIFACTS_DIR}/model_BertLargefp16PTBatch1280.mlirbc"
-  MODULE_FILE_NAME "${ROOT_ARTIFACTS_DIR}/iree_module_BertLargefp16PTBatch1280_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_/module.vmfb"
-  FLAGS
-    "--iree-hal-target-backends=cuda"
-    "--iree-input-type=none"
-    "--iree-hal-cuda-llvm-target-arch=sm_80"
-  FRIENDLY_NAME "BertLargefp16PTBatch1280(linalg) [cuda-sm_80-linux_gnu-cuda][default-flags]"
-  PUBLIC
-)
-
-iree_bytecode_module(
   NAME "iree-module-BERT_LARGE_JAX_384XI32_BATCH1_stablehlo___cuda-sm_80-linux_gnu-cuda__default-flags_"
   SRC "${ROOT_ARTIFACTS_DIR}/model_BERT_LARGE_JAX_384XI32_BATCH1.mlirbc"
   MODULE_FILE_NAME "${ROOT_ARTIFACTS_DIR}/iree_module_BERT_LARGE_JAX_384XI32_BATCH1_stablehlo___cuda-sm_80-linux_gnu-cuda__default-flags_/module.vmfb"
@@ -3676,22 +3568,6 @@
 )
 
 iree_bytecode_module(
-  NAME "iree-module-BertLargefp16PTBatch1_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_"
-  SRC "${ROOT_ARTIFACTS_DIR}/model_BertLargefp16PTBatch1.mlirbc"
-  MODULE_FILE_NAME "${ROOT_ARTIFACTS_DIR}/iree_module_BertLargefp16PTBatch1_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_/module.vmfb"
-  FLAGS
-    "--iree-hal-target-backends=cuda"
-    "--iree-input-type=none"
-    "--iree-hal-cuda-llvm-target-arch=sm_80"
-    "--iree-vm-emit-polyglot-zip=true"
-    "--iree-llvmcpu-debug-symbols=false"
-    "--iree-scheduling-dump-statistics-format=json"
-    "--iree-scheduling-dump-statistics-file=${ROOT_ARTIFACTS_DIR}/iree_module_BertLargefp16PTBatch1_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_/scheduling_stats.json"
-  FRIENDLY_NAME "BertLargefp16PTBatch1(linalg) [cuda-sm_80-linux_gnu-cuda][default-flags,compile-stats]"
-  PUBLIC
-)
-
-iree_bytecode_module(
   NAME "iree-module-EfficientNetV2Sfp16PT_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_"
   SRC "${ROOT_ARTIFACTS_DIR}/model_EfficientNetV2Sfp16PT.mlir"
   MODULE_FILE_NAME "${ROOT_ARTIFACTS_DIR}/iree_module_EfficientNetV2Sfp16PT_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_/module.vmfb"
@@ -4554,134 +4430,6 @@
 )
 
 iree_bytecode_module(
-  NAME "iree-module-BertLargefp16PTBatch16_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_"
-  SRC "${ROOT_ARTIFACTS_DIR}/model_BertLargefp16PTBatch16.mlirbc"
-  MODULE_FILE_NAME "${ROOT_ARTIFACTS_DIR}/iree_module_BertLargefp16PTBatch16_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_/module.vmfb"
-  FLAGS
-    "--iree-hal-target-backends=cuda"
-    "--iree-input-type=none"
-    "--iree-hal-cuda-llvm-target-arch=sm_80"
-    "--iree-vm-emit-polyglot-zip=true"
-    "--iree-llvmcpu-debug-symbols=false"
-    "--iree-scheduling-dump-statistics-format=json"
-    "--iree-scheduling-dump-statistics-file=${ROOT_ARTIFACTS_DIR}/iree_module_BertLargefp16PTBatch16_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_/scheduling_stats.json"
-  FRIENDLY_NAME "BertLargefp16PTBatch16(linalg) [cuda-sm_80-linux_gnu-cuda][default-flags,compile-stats]"
-  PUBLIC
-)
-
-iree_bytecode_module(
-  NAME "iree-module-BertLargefp16PTBatch24_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_"
-  SRC "${ROOT_ARTIFACTS_DIR}/model_BertLargefp16PTBatch24.mlirbc"
-  MODULE_FILE_NAME "${ROOT_ARTIFACTS_DIR}/iree_module_BertLargefp16PTBatch24_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_/module.vmfb"
-  FLAGS
-    "--iree-hal-target-backends=cuda"
-    "--iree-input-type=none"
-    "--iree-hal-cuda-llvm-target-arch=sm_80"
-    "--iree-vm-emit-polyglot-zip=true"
-    "--iree-llvmcpu-debug-symbols=false"
-    "--iree-scheduling-dump-statistics-format=json"
-    "--iree-scheduling-dump-statistics-file=${ROOT_ARTIFACTS_DIR}/iree_module_BertLargefp16PTBatch24_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_/scheduling_stats.json"
-  FRIENDLY_NAME "BertLargefp16PTBatch24(linalg) [cuda-sm_80-linux_gnu-cuda][default-flags,compile-stats]"
-  PUBLIC
-)
-
-iree_bytecode_module(
-  NAME "iree-module-BertLargefp16PTBatch32_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_"
-  SRC "${ROOT_ARTIFACTS_DIR}/model_BertLargefp16PTBatch32.mlirbc"
-  MODULE_FILE_NAME "${ROOT_ARTIFACTS_DIR}/iree_module_BertLargefp16PTBatch32_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_/module.vmfb"
-  FLAGS
-    "--iree-hal-target-backends=cuda"
-    "--iree-input-type=none"
-    "--iree-hal-cuda-llvm-target-arch=sm_80"
-    "--iree-vm-emit-polyglot-zip=true"
-    "--iree-llvmcpu-debug-symbols=false"
-    "--iree-scheduling-dump-statistics-format=json"
-    "--iree-scheduling-dump-statistics-file=${ROOT_ARTIFACTS_DIR}/iree_module_BertLargefp16PTBatch32_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_/scheduling_stats.json"
-  FRIENDLY_NAME "BertLargefp16PTBatch32(linalg) [cuda-sm_80-linux_gnu-cuda][default-flags,compile-stats]"
-  PUBLIC
-)
-
-iree_bytecode_module(
-  NAME "iree-module-BertLargefp16PTBatch48_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_"
-  SRC "${ROOT_ARTIFACTS_DIR}/model_BertLargefp16PTBatch48.mlirbc"
-  MODULE_FILE_NAME "${ROOT_ARTIFACTS_DIR}/iree_module_BertLargefp16PTBatch48_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_/module.vmfb"
-  FLAGS
-    "--iree-hal-target-backends=cuda"
-    "--iree-input-type=none"
-    "--iree-hal-cuda-llvm-target-arch=sm_80"
-    "--iree-vm-emit-polyglot-zip=true"
-    "--iree-llvmcpu-debug-symbols=false"
-    "--iree-scheduling-dump-statistics-format=json"
-    "--iree-scheduling-dump-statistics-file=${ROOT_ARTIFACTS_DIR}/iree_module_BertLargefp16PTBatch48_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_/scheduling_stats.json"
-  FRIENDLY_NAME "BertLargefp16PTBatch48(linalg) [cuda-sm_80-linux_gnu-cuda][default-flags,compile-stats]"
-  PUBLIC
-)
-
-iree_bytecode_module(
-  NAME "iree-module-BertLargefp16PTBatch64_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_"
-  SRC "${ROOT_ARTIFACTS_DIR}/model_BertLargefp16PTBatch64.mlirbc"
-  MODULE_FILE_NAME "${ROOT_ARTIFACTS_DIR}/iree_module_BertLargefp16PTBatch64_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_/module.vmfb"
-  FLAGS
-    "--iree-hal-target-backends=cuda"
-    "--iree-input-type=none"
-    "--iree-hal-cuda-llvm-target-arch=sm_80"
-    "--iree-vm-emit-polyglot-zip=true"
-    "--iree-llvmcpu-debug-symbols=false"
-    "--iree-scheduling-dump-statistics-format=json"
-    "--iree-scheduling-dump-statistics-file=${ROOT_ARTIFACTS_DIR}/iree_module_BertLargefp16PTBatch64_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_/scheduling_stats.json"
-  FRIENDLY_NAME "BertLargefp16PTBatch64(linalg) [cuda-sm_80-linux_gnu-cuda][default-flags,compile-stats]"
-  PUBLIC
-)
-
-iree_bytecode_module(
-  NAME "iree-module-BertLargefp16PTBatch512_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_"
-  SRC "${ROOT_ARTIFACTS_DIR}/model_BertLargefp16PTBatch512.mlirbc"
-  MODULE_FILE_NAME "${ROOT_ARTIFACTS_DIR}/iree_module_BertLargefp16PTBatch512_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_/module.vmfb"
-  FLAGS
-    "--iree-hal-target-backends=cuda"
-    "--iree-input-type=none"
-    "--iree-hal-cuda-llvm-target-arch=sm_80"
-    "--iree-vm-emit-polyglot-zip=true"
-    "--iree-llvmcpu-debug-symbols=false"
-    "--iree-scheduling-dump-statistics-format=json"
-    "--iree-scheduling-dump-statistics-file=${ROOT_ARTIFACTS_DIR}/iree_module_BertLargefp16PTBatch512_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_/scheduling_stats.json"
-  FRIENDLY_NAME "BertLargefp16PTBatch512(linalg) [cuda-sm_80-linux_gnu-cuda][default-flags,compile-stats]"
-  PUBLIC
-)
-
-iree_bytecode_module(
-  NAME "iree-module-BertLargefp16PTBatch1024_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_"
-  SRC "${ROOT_ARTIFACTS_DIR}/model_BertLargefp16PTBatch1024.mlirbc"
-  MODULE_FILE_NAME "${ROOT_ARTIFACTS_DIR}/iree_module_BertLargefp16PTBatch1024_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_/module.vmfb"
-  FLAGS
-    "--iree-hal-target-backends=cuda"
-    "--iree-input-type=none"
-    "--iree-hal-cuda-llvm-target-arch=sm_80"
-    "--iree-vm-emit-polyglot-zip=true"
-    "--iree-llvmcpu-debug-symbols=false"
-    "--iree-scheduling-dump-statistics-format=json"
-    "--iree-scheduling-dump-statistics-file=${ROOT_ARTIFACTS_DIR}/iree_module_BertLargefp16PTBatch1024_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_/scheduling_stats.json"
-  FRIENDLY_NAME "BertLargefp16PTBatch1024(linalg) [cuda-sm_80-linux_gnu-cuda][default-flags,compile-stats]"
-  PUBLIC
-)
-
-iree_bytecode_module(
-  NAME "iree-module-BertLargefp16PTBatch1280_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_"
-  SRC "${ROOT_ARTIFACTS_DIR}/model_BertLargefp16PTBatch1280.mlirbc"
-  MODULE_FILE_NAME "${ROOT_ARTIFACTS_DIR}/iree_module_BertLargefp16PTBatch1280_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_/module.vmfb"
-  FLAGS
-    "--iree-hal-target-backends=cuda"
-    "--iree-input-type=none"
-    "--iree-hal-cuda-llvm-target-arch=sm_80"
-    "--iree-vm-emit-polyglot-zip=true"
-    "--iree-llvmcpu-debug-symbols=false"
-    "--iree-scheduling-dump-statistics-format=json"
-    "--iree-scheduling-dump-statistics-file=${ROOT_ARTIFACTS_DIR}/iree_module_BertLargefp16PTBatch1280_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_/scheduling_stats.json"
-  FRIENDLY_NAME "BertLargefp16PTBatch1280(linalg) [cuda-sm_80-linux_gnu-cuda][default-flags,compile-stats]"
-  PUBLIC
-)
-
-iree_bytecode_module(
   NAME "iree-module-BERT_LARGE_JAX_384XI32_BATCH1_stablehlo___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_"
   SRC "${ROOT_ARTIFACTS_DIR}/model_BERT_LARGE_JAX_384XI32_BATCH1.mlirbc"
   MODULE_FILE_NAME "${ROOT_ARTIFACTS_DIR}/iree_module_BERT_LARGE_JAX_384XI32_BATCH1_stablehlo___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_/module.vmfb"
@@ -6423,7 +6171,6 @@
   ${PACKAGE_NAME}_iree-imported-model-PoseNet_fp32_tflite_
   ${PACKAGE_NAME}_model-BertForMaskedLMTF
   ${PACKAGE_NAME}_model-BertLargeTF
-  ${PACKAGE_NAME}_model-BertLargefp16PTBatch1
   ${PACKAGE_NAME}_model-ClipTextSeqLen64PT
   ${PACKAGE_NAME}_model-EfficientNetB7PT
   ${PACKAGE_NAME}_model-EfficientNetV2SPT
@@ -6471,14 +6218,6 @@
   ${PACKAGE_NAME}_model-BertLargeTFBatch48
   ${PACKAGE_NAME}_model-BertLargeTFBatch512
   ${PACKAGE_NAME}_model-BertLargeTFBatch64
-  ${PACKAGE_NAME}_model-BertLargefp16PTBatch1024
-  ${PACKAGE_NAME}_model-BertLargefp16PTBatch1280
-  ${PACKAGE_NAME}_model-BertLargefp16PTBatch16
-  ${PACKAGE_NAME}_model-BertLargefp16PTBatch24
-  ${PACKAGE_NAME}_model-BertLargefp16PTBatch32
-  ${PACKAGE_NAME}_model-BertLargefp16PTBatch48
-  ${PACKAGE_NAME}_model-BertLargefp16PTBatch512
-  ${PACKAGE_NAME}_model-BertLargefp16PTBatch64
   ${PACKAGE_NAME}_model-RESNET50_FP32_JAX_3X224X224XF32_BATCH1
   ${PACKAGE_NAME}_model-RESNET50_FP32_JAX_3X224X224XF32_BATCH128
   ${PACKAGE_NAME}_model-RESNET50_FP32_JAX_3X224X224XF32_BATCH2048
@@ -6592,7 +6331,6 @@
   ${PACKAGE_NAME}_iree-module-BertLargeTF_stablehlo___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_
   ${PACKAGE_NAME}_iree-module-BertLargeTF_stablehlo___x86_64-cascadelake-linux_gnu-llvm_cpu__default-flags_compile-stats_
   ${PACKAGE_NAME}_iree-module-BertLargeTF_stablehlo___x86_64-cascadelake-linux_gnu-llvm_cpu__experimental-flags_data-tiling_ukernel_compile-stats_
-  ${PACKAGE_NAME}_iree-module-BertLargefp16PTBatch1_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_
   ${PACKAGE_NAME}_iree-module-ClipTextSeqLen64PT_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_
   ${PACKAGE_NAME}_iree-module-ClipTextSeqLen64PT_linalg___nvidia-ampere-vulkan_linux-vulkan_spirv__experimental-flags_tensorcore_compile-stats_
   ${PACKAGE_NAME}_iree-module-ClipTextSeqLen64PT_linalg___nvidia-pascal-vulkan_linux-vulkan_spirv__experimental-flags_simt_compile-stats_
@@ -6749,14 +6487,6 @@
   ${PACKAGE_NAME}_iree-module-BertLargeTFBatch512_stablehlo___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_
   ${PACKAGE_NAME}_iree-module-BertLargeTFBatch64_stablehlo___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_
   ${PACKAGE_NAME}_iree-module-BertLargeTFBatch64_stablehlo___x86_64-cascadelake-linux_gnu-llvm_cpu__default-flags_compile-stats_
-  ${PACKAGE_NAME}_iree-module-BertLargefp16PTBatch1024_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_
-  ${PACKAGE_NAME}_iree-module-BertLargefp16PTBatch1280_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_
-  ${PACKAGE_NAME}_iree-module-BertLargefp16PTBatch16_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_
-  ${PACKAGE_NAME}_iree-module-BertLargefp16PTBatch24_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_
-  ${PACKAGE_NAME}_iree-module-BertLargefp16PTBatch32_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_
-  ${PACKAGE_NAME}_iree-module-BertLargefp16PTBatch48_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_
-  ${PACKAGE_NAME}_iree-module-BertLargefp16PTBatch512_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_
-  ${PACKAGE_NAME}_iree-module-BertLargefp16PTBatch64_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_
   ${PACKAGE_NAME}_iree-module-RESNET50_FP32_JAX_3X224X224XF32_BATCH128_stablehlo___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_
   ${PACKAGE_NAME}_iree-module-RESNET50_FP32_JAX_3X224X224XF32_BATCH1_stablehlo___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_
   ${PACKAGE_NAME}_iree-module-RESNET50_FP32_JAX_3X224X224XF32_BATCH2048_stablehlo___cuda-sm_80-linux_gnu-cuda__default-flags_compile-stats_
@@ -6805,7 +6535,6 @@
 add_dependencies(iree-benchmark-suites-cuda
   ${PACKAGE_NAME}_iree-module-BertForMaskedLMTF_stablehlo___cuda-sm_80-linux_gnu-cuda__default-flags_
   ${PACKAGE_NAME}_iree-module-BertLargeTF_stablehlo___cuda-sm_80-linux_gnu-cuda__default-flags_
-  ${PACKAGE_NAME}_iree-module-BertLargefp16PTBatch1_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_
   ${PACKAGE_NAME}_iree-module-ClipTextSeqLen64PT_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_
   ${PACKAGE_NAME}_iree-module-EfficientNetB7PT_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_
   ${PACKAGE_NAME}_iree-module-EfficientNetV2STF_stablehlo___cuda-sm_80-linux_gnu-cuda__default-flags_
@@ -6852,14 +6581,6 @@
   ${PACKAGE_NAME}_iree-module-BertLargeTFBatch48_stablehlo___cuda-sm_80-linux_gnu-cuda__default-flags_
   ${PACKAGE_NAME}_iree-module-BertLargeTFBatch512_stablehlo___cuda-sm_80-linux_gnu-cuda__default-flags_
   ${PACKAGE_NAME}_iree-module-BertLargeTFBatch64_stablehlo___cuda-sm_80-linux_gnu-cuda__default-flags_
-  ${PACKAGE_NAME}_iree-module-BertLargefp16PTBatch1024_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_
-  ${PACKAGE_NAME}_iree-module-BertLargefp16PTBatch1280_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_
-  ${PACKAGE_NAME}_iree-module-BertLargefp16PTBatch16_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_
-  ${PACKAGE_NAME}_iree-module-BertLargefp16PTBatch24_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_
-  ${PACKAGE_NAME}_iree-module-BertLargefp16PTBatch32_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_
-  ${PACKAGE_NAME}_iree-module-BertLargefp16PTBatch48_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_
-  ${PACKAGE_NAME}_iree-module-BertLargefp16PTBatch512_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_
-  ${PACKAGE_NAME}_iree-module-BertLargefp16PTBatch64_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_
   ${PACKAGE_NAME}_iree-module-RESNET50_FP32_JAX_3X224X224XF32_BATCH128_stablehlo___cuda-sm_80-linux_gnu-cuda__default-flags_
   ${PACKAGE_NAME}_iree-module-RESNET50_FP32_JAX_3X224X224XF32_BATCH1_stablehlo___cuda-sm_80-linux_gnu-cuda__default-flags_
   ${PACKAGE_NAME}_iree-module-RESNET50_FP32_JAX_3X224X224XF32_BATCH2048_stablehlo___cuda-sm_80-linux_gnu-cuda__default-flags_
@@ -6905,7 +6626,6 @@
   ${PACKAGE_NAME}_iree-module-BertLargeTF_stablehlo___cuda-sm_80-linux_gnu-cuda__default-flags_
   ${PACKAGE_NAME}_iree-module-BertLargeTF_stablehlo___x86_64-cascadelake-linux_gnu-llvm_cpu__default-flags_
   ${PACKAGE_NAME}_iree-module-BertLargeTF_stablehlo___x86_64-cascadelake-linux_gnu-llvm_cpu__experimental-flags_data-tiling_ukernel_
-  ${PACKAGE_NAME}_iree-module-BertLargefp16PTBatch1_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_
   ${PACKAGE_NAME}_iree-module-ClipTextSeqLen64PT_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_
   ${PACKAGE_NAME}_iree-module-ClipTextSeqLen64PT_linalg___nvidia-ampere-vulkan_linux-vulkan_spirv__experimental-flags_tensorcore_
   ${PACKAGE_NAME}_iree-module-ClipTextSeqLen64PT_linalg___nvidia-pascal-vulkan_linux-vulkan_spirv__experimental-flags_simt_
@@ -7062,14 +6782,6 @@
   ${PACKAGE_NAME}_iree-module-BertLargeTFBatch512_stablehlo___cuda-sm_80-linux_gnu-cuda__default-flags_
   ${PACKAGE_NAME}_iree-module-BertLargeTFBatch64_stablehlo___cuda-sm_80-linux_gnu-cuda__default-flags_
   ${PACKAGE_NAME}_iree-module-BertLargeTFBatch64_stablehlo___x86_64-cascadelake-linux_gnu-llvm_cpu__default-flags_
-  ${PACKAGE_NAME}_iree-module-BertLargefp16PTBatch1024_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_
-  ${PACKAGE_NAME}_iree-module-BertLargefp16PTBatch1280_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_
-  ${PACKAGE_NAME}_iree-module-BertLargefp16PTBatch16_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_
-  ${PACKAGE_NAME}_iree-module-BertLargefp16PTBatch24_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_
-  ${PACKAGE_NAME}_iree-module-BertLargefp16PTBatch32_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_
-  ${PACKAGE_NAME}_iree-module-BertLargefp16PTBatch48_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_
-  ${PACKAGE_NAME}_iree-module-BertLargefp16PTBatch512_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_
-  ${PACKAGE_NAME}_iree-module-BertLargefp16PTBatch64_linalg___cuda-sm_80-linux_gnu-cuda__default-flags_
   ${PACKAGE_NAME}_iree-module-RESNET50_FP32_JAX_3X224X224XF32_BATCH128_stablehlo___cuda-sm_80-linux_gnu-cuda__default-flags_
   ${PACKAGE_NAME}_iree-module-RESNET50_FP32_JAX_3X224X224XF32_BATCH1_stablehlo___cuda-sm_80-linux_gnu-cuda__default-flags_
   ${PACKAGE_NAME}_iree-module-RESNET50_FP32_JAX_3X224X224XF32_BATCH2048_stablehlo___cuda-sm_80-linux_gnu-cuda__default-flags_
diff --git a/third_party/llvm-project b/third_party/llvm-project
index 37937b8..4706251 160000
--- a/third_party/llvm-project
+++ b/third_party/llvm-project
@@ -1 +1 @@
-Subproject commit 37937b8a040c5525a92c7ce23c329383e9d0c8e4
+Subproject commit 4706251a3186c34da0ee8fd894f7e6b095da8fdc