Add vector duplicate instruction support.

PiperOrigin-RevId: 558295664
diff --git a/sim/kelvin_memory.bin_fmt b/sim/kelvin_memory.bin_fmt
index 37b8154..f3105d6 100644
--- a/sim/kelvin_memory.bin_fmt
+++ b/sim/kelvin_memory.bin_fmt
@@ -113,6 +113,14 @@
   vstq_h_sp_xx_m  : KelvinV2ArgsType : func2 == 0b01'1110, vs2 != 0, sz == 0b01, m == 0b01, func1 == 0b111, form == 0b11;
   vstq_w_sp_xx_m  : KelvinV2ArgsType : func2 == 0b01'1110, vs2 != 0, sz == 0b10, m == 0b01, func1 == 0b111, form == 0b11;
 
+  // vdup: duplicate a scalar value into a vector register; the *_m entries (m == 0b01) are the strip-mine forms.
+  vdup_b_x    : KelvinV2ArgsType : func2 == 0b01'0000, vs1 == 0, sz == 0b00, m == 0b00, func1 == 0b111, form == 0b11;
+  vdup_h_x    : KelvinV2ArgsType : func2 == 0b01'0000, vs1 == 0, sz == 0b01, m == 0b00, func1 == 0b111, form == 0b11;
+  vdup_w_x    : KelvinV2ArgsType : func2 == 0b01'0000, vs1 == 0, sz == 0b10, m == 0b00, func1 == 0b111, form == 0b11;
+  vdup_b_x_m  : KelvinV2ArgsType : func2 == 0b01'0000, vs1 == 0, sz == 0b00, m == 0b01, func1 == 0b111, form == 0b11;
+  vdup_h_x_m  : KelvinV2ArgsType : func2 == 0b01'0000, vs1 == 0, sz == 0b01, m == 0b01, func1 == 0b111, form == 0b11;
+  vdup_w_x_m  : KelvinV2ArgsType : func2 == 0b01'0000, vs1 == 0, sz == 0b10, m == 0b01, func1 == 0b111, form == 0b11;
+
   // vcget
   vcget           : KelvinV2ArgsType : func2 == 0b01'0100, vs2 == 0, vs1 == 0, vd == 48, func1 == 0b111, form == 0b11;
 
diff --git a/sim/kelvin_memory.isa b/sim/kelvin_memory.isa
index 88a380e..bc83d3e 100644
--- a/sim/kelvin_memory.isa
+++ b/sim/kelvin_memory.isa
@@ -481,6 +481,26 @@
     vstq_w_sp_xx_m{: vd, vs1, vs2 : vs1},
       disasm: "vstq.w.s.xx.m", "%vd, %vs1, %vs2",
       semfunc: "absl::bind_front(&KelvinVStQ<int32_t>, /*strip_mine*/ true)";
+
+    // vdup
+    vdup_b_x{: vs2 : vd },
+      disasm: "vdup.b.x", "%vd, %vs2",
+      semfunc: "absl::bind_front(&KelvinVDup<int8_t>, /*strip_mine*/ false)";
+    vdup_h_x{: vs2 : vd },
+      disasm: "vdup.h.x", "%vd, %vs2",
+      semfunc: "absl::bind_front(&KelvinVDup<int16_t>, /*strip_mine*/ false)";
+    vdup_w_x{: vs2 : vd },
+      disasm: "vdup.w.x", "%vd, %vs2",
+      semfunc: "absl::bind_front(&KelvinVDup<int32_t>, /*strip_mine*/ false)";
+    vdup_b_x_m{: vs2 : vd },
+      disasm: "vdup.b.x.m", "%vd, %vs2",
+      semfunc: "absl::bind_front(&KelvinVDup<int8_t>, /*strip_mine*/ true)";
+    vdup_h_x_m{: vs2 : vd },
+      disasm: "vdup.h.x.m", "%vd, %vs2",
+      semfunc: "absl::bind_front(&KelvinVDup<int16_t>, /*strip_mine*/ true)";
+    vdup_w_x_m{: vs2 : vd },
+      disasm: "vdup.w.x.m", "%vd, %vs2",
+      semfunc: "absl::bind_front(&KelvinVDup<int32_t>, /*strip_mine*/ true)";
 
     // vcget
     vcget{: : vd},
diff --git a/sim/kelvin_vector_memory_instructions.cc b/sim/kelvin_vector_memory_instructions.cc
index c595a20..67c0966 100644
--- a/sim/kelvin_vector_memory_instructions.cc
+++ b/sim/kelvin_vector_memory_instructions.cc
@@ -257,6 +257,34 @@
 template void KelvinVSt<int16_t>(bool, bool, bool, Instruction *);
 template void KelvinVSt<int32_t>(bool, bool, bool, Instruction *);
 
+// Duplicates the scalar value read from source operand 0 into every element
+// of the destination vector operand (4 data buffers if strip_mine, else 1).
+template <typename T>
+void KelvinVDup(bool strip_mine, Instruction *inst) {
+  auto *state = static_cast<KelvinState *>(inst->state());
+  const int vector_size_in_bytes = state->vector_length() / 8;
+  const uint32_t elts_per_register = vector_size_in_bytes / sizeof(T);
+  const int num_ops = strip_mine ? 4 : 1;
+
+  // Gets destination register and scalar value.
+  auto *vd = static_cast<RV32VectorDestinationOperand *>(inst->Destination(0));
+  const auto value = GetInstructionSource<T>(inst, 0);
+
+  // Fills one destination buffer per op and submits it to the register.
+  for (int op_index = 0; op_index < num_ops; ++op_index) {
+    DataBuffer *dest_db = vd->AllocateDataBuffer(op_index);
+    absl::Span<T> dest_span = dest_db->template Get<T>();
+    for (uint32_t i = 0; i < elts_per_register; ++i) {
+      dest_span[i] = value;
+    }
+    dest_db->Submit();
+  }
+}
+
+template void KelvinVDup<int8_t>(bool, Instruction *);
+template void KelvinVDup<int16_t>(bool, Instruction *);
+template void KelvinVDup<int32_t>(bool, Instruction *);
+
 template <typename T>
 void KelvinVStQ(bool strip_mine, Instruction *inst) {
   VectorStoreHelper<T>(/*has_length=*/false, /*has_stride=*/true, strip_mine,
diff --git a/sim/kelvin_vector_memory_instructions.h b/sim/kelvin_vector_memory_instructions.h
index ff53da4..19f73db 100644
--- a/sim/kelvin_vector_memory_instructions.h
+++ b/sim/kelvin_vector_memory_instructions.h
@@ -19,6 +19,9 @@
                Instruction *inst);
 
 template <typename T>
+void KelvinVDup(bool strip_mine, Instruction *inst);
+
+template <typename T>
 void KelvinVStQ(bool strip_mine, Instruction *inst);
 
 template <typename T>
diff --git a/sim/test/kelvin_vector_memory_instructions_test.cc b/sim/test/kelvin_vector_memory_instructions_test.cc
index 49d058d..82d986c 100644
--- a/sim/test/kelvin_vector_memory_instructions_test.cc
+++ b/sim/test/kelvin_vector_memory_instructions_test.cc
@@ -27,6 +27,7 @@
 using kelvin::sim::KelvinADwInit;
 using kelvin::sim::KelvinGetVl;
 using kelvin::sim::KelvinVcGet;
+using kelvin::sim::KelvinVDup;
 using kelvin::sim::KelvinVLd;
 using kelvin::sim::KelvinVLdRegWrite;
 using kelvin::sim::KelvinVSt;
@@ -339,6 +340,62 @@
     StoreQuadOpTestHelper<TNext1, TNext...>(name);
   }
 
+  // Runs KelvinVDup<T> with and without strip mining and checks the result.
+  template <typename T>
+  void VdupOpTestHelper() {
+    for (auto strip_mine : {false, true}) {
+      // Setup instruction and its operands.
+      auto instruction = CreateInstruction();
+      const auto subname = absl::StrCat("VDup", KelvinTestTypeSuffix<T>(),
+                                        strip_mine ? "M" : "");
+      const uint32_t num_ops = strip_mine ? 4 : 1;
+      AppendRegisterOperands(instruction.get(), {kelvin::sim::test::kRs1Name},
+                             {});
+      AppendVectorRegisterOperands(
+          instruction.get(), num_ops, 1 /* src1_widen_factor */, {}, {},
+          false /* widen_dst */, {kelvin::sim::test::kVd});
+
+      instruction->set_semantic_function(
+          absl::bind_front(&KelvinVDup<T>, strip_mine));
+
+      // Sets operand scalar register value.
+      constexpr uint32_t value = 0x12345678;
+      SetRegisterValues<uint32_t>({{kelvin::sim::test::kRs1Name, value}});
+
+      // Fills vector registers with random values.
+      const uint32_t vector_length_in_bytes = state_->vector_length() / 8;
+      const uint32_t vd_size = vector_length_in_bytes / sizeof(T);
+      std::vector<T> vd_value(vd_size * num_ops);
+      auto vd_span = absl::Span<T>(vd_value);
+      FillArrayWithRandomValues<T>(vd_span);
+      for (uint32_t i = 0; i < num_ops; i++) {
+        auto vd_name = absl::StrCat("v", kelvin::sim::test::kVd + i);
+        SetVectorRegisterValues<T>(
+            {{vd_name, vd_span.subspan(vd_size * i, vd_size)}});
+      }
+
+      instruction->Execute();
+
+      // Verifies every register element equals the scalar value truncated to T.
+      for (uint32_t op_num = 0; op_num < num_ops; op_num++) {
+        auto vreg_num = kelvin::sim::test::kVd + op_num;
+        auto vreg_span = vreg_[vreg_num]->data_buffer()->Get<T>();
+        for (uint32_t elt_index = 0; elt_index < vd_size; elt_index++) {
+          EXPECT_EQ(static_cast<T>(value), vreg_span[elt_index])
+              << absl::StrCat(subname, ": vreg[", vreg_num, "][", elt_index,
+                              "] mismatch");
+        }
+      }
+    }
+  }
+
+  // Type-list overload: runs the single-type helper for each listed type.
+  template <typename T1, typename TNext1, typename... TNext>
+  void VdupOpTestHelper() {
+    VdupOpTestHelper<T1>();
+    VdupOpTestHelper<TNext1, TNext...>();
+  }
+
  protected:
   template <typename T>
   T GetDefaultMemoryValue(int address) {
@@ -374,6 +431,10 @@
   StoreQuadOpTestHelper<int8_t, int16_t, int32_t>("VStQ");
 }
 
+TEST_F(KelvinVectorMemoryInstructionsTest, VDup) {
+  VdupOpTestHelper<int8_t, int16_t, int32_t>();
+}
+
 class KelvinGetVlInstructionTest
     : public kelvin::sim::test::KelvinVectorInstructionsTestBase {
  public: