Serialize vector<...i8> constants to flatbuffers much more efficiently.
There is likely still work we can do here to avoid additional allocs/copies,
but this at least removes the extremely slow last-mile iteration over each
byte (with conversion to APInt and truncation) that was happening before.
diff --git a/iree/compiler/Dialect/VM/Target/Bytecode/ConstantEncoder.cpp b/iree/compiler/Dialect/VM/Target/Bytecode/ConstantEncoder.cpp
index 612d3d0..180535a 100644
--- a/iree/compiler/Dialect/VM/Target/Bytecode/ConstantEncoder.cpp
+++ b/iree/compiler/Dialect/VM/Target/Bytecode/ConstantEncoder.cpp
@@ -36,11 +36,21 @@
 
 static Offset<Vector<uint8_t>> serializeConstantI8Array(
     DenseIntElementsAttr attr, FlatBufferBuilder &fbb) {
+  // vm.rodata and other very large constants end up here. An i8 element is a
+  // single byte, so endianness is irrelevant and non-splat data can be copied
+  // straight out of the attribute's raw storage with a single memcpy.
   uint8_t *bytePtr = nullptr;
   auto byteVector =
       fbb.CreateUninitializedVector(attr.getNumElements() * 1, &bytePtr);
-  for (const APInt &value : attr.getIntValues()) {
-    *(bytePtr++) = value.extractBitsAsZExtValue(8, 0) & UINT8_MAX;
+  if (attr.isSplat()) {
+    // NOTE: splats are written out one element at a time; this is a slow path
+    // that should have been eliminated earlier during constant op conversion.
+    for (const APInt &value : attr.getIntValues()) {
+      *(bytePtr++) = value.extractBitsAsZExtValue(8, 0) & UINT8_MAX;
+    }
+  } else {
+    auto rawData = attr.getRawData();
+    std::memcpy(bytePtr, rawData.data(), rawData.size());
   }
   return byteVector;
 }