Fix `vzip` stripmine instruction

PiperOrigin-RevId: 561748361
diff --git a/sim/kelvin_vector_instructions.cc b/sim/kelvin_vector_instructions.cc
index 48d484f..241789d 100644
--- a/sim/kelvin_vector_instructions.cc
+++ b/sim/kelvin_vector_instructions.cc
@@ -1283,17 +1283,20 @@
 template void KelvinVEvnodd<int32_t>(bool, bool, Instruction *);
 
 // Interleave even/odd lanes of two operands.
-// Returns odd elements of concatenated registers.
+// Returns even elements of concatenated registers.
 template <typename T>
 T VZipOpGetArg1(const Instruction *inst, bool scalar, int num_ops, int op_index,
                 int dst_element_index, int dst_reg_index) {
   auto state = static_cast<KelvinState *>(inst->state());
   const int vector_size_in_bytes = state->vector_length() / 8;
   const int elts_per_register = vector_size_in_bytes / sizeof(T);
+  const int half_elts_per_register = elts_per_register / 2;
 
-  auto src_element_index = op_index * elts_per_register +
+  // Only takes the even elements. For the stripmine version, the offsets are
+  // counted in units of half the register size.
+  auto src_element_index = op_index * half_elts_per_register +
                            dst_element_index / 2 +
-                           dst_reg_index * elts_per_register / 2;
+                           dst_reg_index * half_elts_per_register * num_ops;
 
   if (dst_element_index & 1) {
     return GetInstructionSource<T>(inst, 1, scalar ? 0 : src_element_index);
diff --git a/sim/test/kelvin_vector_instructions_test.cc b/sim/test/kelvin_vector_instructions_test.cc
index 7a11d44..4b057e3 100644
--- a/sim/test/kelvin_vector_instructions_test.cc
+++ b/sim/test/kelvin_vector_instructions_test.cc
@@ -1654,8 +1654,9 @@
     const std::vector<T> &vs1_value, int vs2_size, bool s2_scalar,
     const std::vector<T> &vs2_value, T rs2_value, bool halftype_op,
     bool vmvp_op) {
-  auto src_index =
-      op_num * vs1_size + element_index / 2 + dest_reg_sub_index * vs1_size / 2;
+  auto src_index = (op_num * vs1_size + element_index +
+                    dest_reg_sub_index * vs1_size * num_ops) /
+                   2;
 
   T arg1;
   if (element_index & 1) {