Kelvin dwconv instructions.

PiperOrigin-RevId: 565407892
diff --git a/sim/kelvin_conv.bin_fmt b/sim/kelvin_conv.bin_fmt
index 9526fdd..627262d 100644
--- a/sim/kelvin_conv.bin_fmt
+++ b/sim/kelvin_conv.bin_fmt
@@ -1,4 +1,8 @@
 instruction group KelvinVectorConvInst[32] : KelvinV3ArgsType {
   // vconv
   aconv_vxv      : KelvinV3ArgsType : func3_hi == 0b10, func3_lo == 0b00, vd == 48, vs1_low4 == 0, vs2 != 0, vs3_low3 == 0, m == 0, form == 0b101;
+
+  // vdwconv: the two forms differ only in vs2_hi1 (1 = adwconv, 0 = vdwconv).
+  adwconv_vxv    : KelvinV3ArgsType : func3_hi == 0b10, func3_lo == 0b10, vd_low2 == 0b00, vs2_hi1 == 0b1, m == 0, form == 0b101;
+  vdwconv_vxv    : KelvinV3ArgsType : func3_hi == 0b10, func3_lo == 0b10, vd_low2 == 0b00, vs2_hi1 == 0b0, m == 0, form == 0b101;
 };
diff --git a/sim/kelvin_encoding.cc b/sim/kelvin_encoding.cc
index 5b68af7..5597833 100644
--- a/sim/kelvin_encoding.cc
+++ b/sim/kelvin_encoding.cc
@@ -212,6 +212,11 @@
               absl::StrCat(mpact::sim::riscv::RiscVState::kXregPrefix, reg_num),
               mpact::sim::riscv::kXRegisterAliases[reg_num]);
         }
+        if (opcode_ == OpcodeEnum::kAdwconvVxv ||
+            opcode_ == OpcodeEnum::kVdwconvVxv) {
+          return GetVectorRegisterSourceOp<mpact::sim::riscv::RVVectorRegister>(
+              state_, reg_num, /*strip_mine*/ false, /*widen_factor*/ 9);
+        }
         if (opcode_ == OpcodeEnum::kAdwinit) {
           // Borrow the strip_mine setting to set 4x registers.
           strip_mine = true;
@@ -295,7 +300,8 @@
       [this](int latency) -> DestinationOperandInterface * {
         auto reg_num = encoding::kelvin_v2_args_type::ExtractVd(inst_word_);
         bool strip_mine = encoding::kelvin_v2_args_type::ExtractM(inst_word_);
-        if (opcode_ == OpcodeEnum::kVcget || opcode_ == OpcodeEnum::kAdwinit) {
+        if (opcode_ == OpcodeEnum::kVcget || opcode_ == OpcodeEnum::kAdwinit ||
+            opcode_ == OpcodeEnum::kVdwconvVxv) {
           // Borrow the strip_mine setting to set 4x/8x registers although it is
           // not part of the encoding.
           strip_mine = true;
diff --git a/sim/kelvin_format.bin_fmt b/sim/kelvin_format.bin_fmt
index a8bc126..bf34e6f 100644
--- a/sim/kelvin_format.bin_fmt
+++ b/sim/kelvin_format.bin_fmt
@@ -130,6 +130,9 @@
     unsigned func3_lo[2];
     unsigned form[3];  // .vvv=0b001, .vxv=0b101.
   overlays:
+    unsigned vs2_hi1[1] = vs2[5];
+    unsigned rs2[5] = vs2[4..0];
     unsigned vs1_low4[4] = vs1[3..0];
     unsigned vs3_low3[3] = vs3[2..0];
+    unsigned vd_low2[2] = vd[1..0];
 };
diff --git a/sim/kelvin_mul.isa b/sim/kelvin_mul.isa
index cb2fa73..7d9ef03 100644
--- a/sim/kelvin_mul.isa
+++ b/sim/kelvin_mul.isa
@@ -537,5 +537,13 @@
     aconv_vxv{: vs1, vs2, vs3 : vd},
       disasm: "aconv.vxv", "%vd, %vs1, %vs2, %vs3",
       semfunc: "&KelvinVConv";
+    // adwconv: accumulate only (write_acc is bound to false).
+    adwconv_vxv{: vs1, rs2, vs3 : vd},
+      disasm: "adwconv.vxv", "%vd, %vs1, %rs2, %vs3",
+      semfunc: "absl::bind_front(&KelvinVDwconv, /*write_acc*/ false)";
+    // vdwconv: accumulate, then write the accumulator to vd (write_acc true).
+    vdwconv_vxv{: vs1, rs2, vs3 : vd},
+      disasm: "vdwconv.vxv", "%vd, %vs1, %rs2, %vs3",
+      semfunc: "absl::bind_front(&KelvinVDwconv, /*write_acc*/ true)";
   }
 }
diff --git a/sim/kelvin_state.h b/sim/kelvin_state.h
index 52d271b..2c09e99 100644
--- a/sim/kelvin_state.h
+++ b/sim/kelvin_state.h
@@ -29,6 +29,8 @@
 
 using AccArrayType = AccArrayTemplate<uint32_t>;
 
+using DwAccArray = std::array<uint32_t, 32>;  // Depthwise accumulator words.
+
 class KelvinState : public mpact::sim::riscv::RiscVState {
  public:
   KelvinState(absl::string_view id, mpact::sim::riscv::RiscVXlen xlen,
@@ -52,6 +54,10 @@
   AccArrayType *acc_vec(int index) { return &(acc_register_[index]); }
   AccArrayTemplate<AccArrayType> acc_register() const { return acc_register_; }
 
+  uint32_t *dw_acc_vec(int i) { return &depthwise_acc_register_[i]; }
+  DwAccArray &dw_acc_register() { return depthwise_acc_register_; }
+  const DwAccArray &dw_acc_register() const { return depthwise_acc_register_; }
+
   void SetLogArgs(std::any data) { log_args_.emplace_back(std::move(data)); }
   std::string *clog_string() { return &clog_string_; }
   void PrintLog(absl::string_view format_string);
@@ -75,6 +81,9 @@
 
   // Convolution accumulation register, set to be uint32[VLENW][VLENW].
   AccArrayTemplate<AccArrayType> acc_register_;
+
+  // Depthwise convolution accumulation register.
+  DwAccArray depthwise_acc_register_;
 };
 
 }  // namespace kelvin::sim
diff --git a/sim/kelvin_vector_convolution_instructions.cc b/sim/kelvin_vector_convolution_instructions.cc
index 4557362..9f4d006 100644
--- a/sim/kelvin_vector_convolution_instructions.cc
+++ b/sim/kelvin_vector_convolution_instructions.cc
@@ -13,11 +13,17 @@
 #include "mpact/sim/generic/type_helpers.h"
 
 namespace kelvin::sim {
+namespace {
+constexpr int kVectorLenInByte = kVectorLengthInBits / 8;  // Bits -> bytes.
+constexpr int kVectorLenInWord = kVectorLenInByte / sizeof(uint32_t);
+constexpr int kDwRegisterProducts = 3;  // Register products per dwconv op.
+}  // namespace
 
+using ::mpact::sim::generic::DataBuffer;
 using ::mpact::sim::generic::operator*;  // NOLINT: is used below (clang error).
-
-using mpact::sim::generic::GetInstructionSource;
-using mpact::sim::riscv::RV32VectorSourceOperand;
+using ::mpact::sim::generic::GetInstructionSource;
+using ::mpact::sim::riscv::RV32VectorDestinationOperand;
+using ::mpact::sim::riscv::RV32VectorSourceOperand;
 
 // Implement the 3-arg vector convolution (im2col + matmul)
 // vs1 (narrow) represents the starting register of 8 vector registers
@@ -26,8 +32,6 @@
 // `vd` is not used in the op.
 void KelvinVConv(Instruction *inst) {
   auto state = static_cast<KelvinState *>(inst->state());
-  constexpr int kVectorLenInByte = kVectorLengthInBits / 8;
-  constexpr int kVectorLenInWord = kVectorLenInByte / sizeof(uint32_t);
 
   vconv_cmd_t conv_cmd;
   auto reg_data = GetInstructionSource<uint32_t>(inst, 1, 0);
@@ -111,4 +115,128 @@
   }
 }
 
+// Depthwise convolution step: accumulates three 8bit*8bit Hadamard products
+// into the depthwise accumulator held in KelvinState (dw_acc_vec).
+// vs1 is the first of 9 activation registers; cmd.regbase selects three.
+// vs3 is the first of a group of 3 registers multiplied against them.
+// xs2 holds the command word (vdwconv_u8_t); sparsity == 3 traps and returns.
+// `vd` (4 registers) receives the accumulator only if |write_acc| is true.
+void KelvinVDwconv(bool write_acc, Instruction *inst) {
+  KelvinState *state = static_cast<KelvinState *>(inst->state());
+  uint32_t reg_data = GetInstructionSource<uint32_t>(inst, 1, 0);
+  vdwconv_u8_t dwconv_cmd;
+  memcpy(&dwconv_cmd, &reg_data, sizeof(dwconv_cmd));
+  // Pick the three activation rows as indices into the vs1 register group.
+  int vs1_idx[3];
+  switch (dwconv_cmd.regbase) {
+    case 0:
+    case 1:
+    case 2:
+    case 3:
+    case 4:
+    case 5:
+    case 6:
+      vs1_idx[0] = dwconv_cmd.regbase;
+      vs1_idx[1] = dwconv_cmd.regbase + 1;
+      vs1_idx[2] = dwconv_cmd.regbase + 2;
+      break;
+    case 7:
+      vs1_idx[0] = 1;
+      vs1_idx[1] = 0;
+      vs1_idx[2] = 2;
+      break;
+    case 8:
+    case 9:
+    case 10:
+    case 11:
+      vs1_idx[0] = (2 * dwconv_cmd.regbase) - 15;
+      vs1_idx[1] = vs1_idx[0] + 1;
+      vs1_idx[2] = 0;
+      break;
+    case 12:
+    case 13:
+    case 14:
+    case 15:
+      vs1_idx[0] = (2 * dwconv_cmd.regbase) - 22;
+      vs1_idx[1] = 0;
+      vs1_idx[2] = 1;
+      break;
+  }
+  // Build the three activation rows; sparsity 1 and 2 shift by 32-bit words.
+  auto vs1 = static_cast<RV32VectorSourceOperand *>(inst->Source(0));
+  absl::Span<uint32_t> vs10_span =
+      vs1->GetRegister(vs1_idx[0])->data_buffer()->Get<uint32_t>();
+  absl::Span<uint32_t> vs11_span =
+      vs1->GetRegister(vs1_idx[1])->data_buffer()->Get<uint32_t>();
+  absl::Span<uint32_t> vs12_span =
+      vs1->GetRegister(vs1_idx[2])->data_buffer()->Get<uint32_t>();
+  uint32_t a_data[kDwRegisterProducts * kVectorLenInWord];
+  switch (dwconv_cmd.sparsity) {
+    case 0:
+      memcpy(a_data, vs10_span.data(), 8 * sizeof(uint32_t));
+      memcpy(a_data + 8, vs11_span.data(), 8 * sizeof(uint32_t));
+      memcpy(a_data + 16, vs12_span.data(), 8 * sizeof(uint32_t));
+      break;
+    case 1:
+      a_data[0] = vs10_span[7];
+      memcpy(a_data + 1, vs11_span.data(), 7 * sizeof(uint32_t));
+      memcpy(a_data + 8, vs11_span.data(), 8 * sizeof(uint32_t));
+      memcpy(a_data + 16, vs11_span.data() + 1, 7 * sizeof(uint32_t));
+      a_data[23] = vs12_span[0];
+      break;
+    case 2:
+      memcpy(a_data, vs10_span.data(), 8 * sizeof(uint32_t));
+      memcpy(a_data + 8, vs10_span.data() + 1, 7 * sizeof(uint32_t));
+      a_data[15] = vs11_span[0];
+      memcpy(a_data + 16, vs10_span.data() + 2, 6 * sizeof(uint32_t));
+      a_data[22] = vs11_span[0];
+      a_data[23] = vs11_span[1];
+      break;
+    default:  // Invalid sparsity value: trap without touching the accumulator.
+      state->Trap(/*is_interrupt=*/false, /*trap_value=*/0,
+                  *mpact::sim::riscv::ExceptionCode::kIllegalInstruction,
+                  /*epc=*/inst->address(), inst);
+      return;
+  }
+
+  auto vs3 = static_cast<RV32VectorSourceOperand *>(inst->Source(2));
+  int32_t *acc = reinterpret_cast<int32_t *>(state->dw_acc_vec(0));
+
+  for (int r = 0; r < kDwRegisterProducts; r++) {
+    absl::Span<uint8_t> a_span = absl::Span<uint8_t>(
+        reinterpret_cast<uint8_t *>(a_data + (r * kVectorLenInWord)),
+        kVectorLenInByte);
+    absl::Span<uint8_t> b_span =
+        vs3->GetRegister(r)->data_buffer()->Get<uint8_t>();
+
+    for (int i = 0; i < kVectorLenInByte; i++) {
+      int32_t a =
+          dwconv_cmd.sdata1 ? static_cast<int8_t>(a_span[i]) : a_span[i];
+      int32_t b =
+          dwconv_cmd.sdata2 ? static_cast<int8_t>(b_span[i]) : b_span[i];
+      a += dwconv_cmd.sbias1;
+      b += dwconv_cmd.sbias2;
+      // Byte lane (i % 4) picks the accumulator row; i / 4 picks the word.
+      constexpr static int interleave[4] = {0, 2, 1, 3};
+      int acc_reg = interleave[(i & 0b11)];
+      int reg_offset = i >> 2;
+      acc[kVectorLenInWord * acc_reg + reg_offset] += a * b;
+    }
+  }
+
+  if (!write_acc) {
+    return;
+  }
+  // Copy the accumulator out to the four destination registers.
+  auto vd = static_cast<RV32VectorDestinationOperand *>(inst->Destination(0));
+  for (int i = 0; i < 4; i++) {
+    DataBuffer *dest_db = vd->AllocateDataBuffer(i);
+    absl::Span<uint32_t> dest_span = dest_db->Get<uint32_t>();
+    for (int j = 0; j < kVectorLenInWord; j++) {
+      dest_span[j] = acc[i * kVectorLenInWord + j];
+    }
+    dest_db->Submit();
+  }
+}
+
 }  // namespace kelvin::sim
diff --git a/sim/kelvin_vector_convolution_instructions.h b/sim/kelvin_vector_convolution_instructions.h
index 6d08f98..23d01c7 100644
--- a/sim/kelvin_vector_convolution_instructions.h
+++ b/sim/kelvin_vector_convolution_instructions.h
@@ -20,8 +20,22 @@
   uint32_t sdata2 : 1;  // 0
 } vconv_cmd_t;
 
+// Command word for the depthwise convolution; KelvinVDwconv copies xs2 into it.
+typedef struct KelvinVDwconvCmd {
+  uint32_t mode : 2;      // 1:0   (not read by KelvinVDwconv)
+  uint32_t sparsity : 2;  // 3:2   activation row layout; 3 is invalid
+  uint32_t regbase : 4;   // 7:4   selects the three vs1 registers used
+  uint32_t rsvd : 4;      // 11:8
+  int32_t sbias1 : 9;     // 20:12 signed bias added to each vs1 element
+  uint32_t sdata1 : 1;    // 21    1: vs1 elements are read as int8
+  int32_t sbias2 : 9;     // 30:22 signed bias added to each vs3 element
+  uint32_t sdata2 : 1;    // 31    1: vs3 elements are read as int8
+} vdwconv_u8_t;
+
 void KelvinVConv(Instruction *inst);
 
+void KelvinVDwconv(bool write_acc, Instruction *inst);
+
 }  // namespace kelvin::sim
 
 #endif  // SIM_KELVIN_VECTOR_CONVOLUTION_INSTRUCTIONS_H_
diff --git a/sim/kelvin_vector_memory_instructions.cc b/sim/kelvin_vector_memory_instructions.cc
index fd1f099..84069a8 100644
--- a/sim/kelvin_vector_memory_instructions.cc
+++ b/sim/kelvin_vector_memory_instructions.cc
@@ -379,6 +379,12 @@
   auto vd = static_cast<RV32VectorDestinationOperand *>(inst->Destination(0));
   for (int op_index = 0; op_index < kInitSize; ++op_index) {
     auto source_span = vs->GetRegister(op_index)->data_buffer()->Get<uint8_t>();
+    uint8_t *dwacc_span =
+        reinterpret_cast<uint8_t *>(state->dw_acc_vec(8 * op_index));
+    for (int i = 0; i < 32; i++) {
+      dwacc_span[i] = source_span[i];
+    }
+
     DataBuffer *dest_db = vd->AllocateDataBuffer(op_index);
     absl::Span<uint8_t> dest_span = dest_db->Get<uint8_t>();
     for (int i = 0; i < init_n; ++i) {
diff --git a/sim/test/BUILD b/sim/test/BUILD
index 7db1c7b..816ee4d 100644
--- a/sim/test/BUILD
+++ b/sim/test/BUILD
@@ -136,6 +136,7 @@
     deps = [
         ":kelvin_vector_instructions_test_base",
         "//sim:kelvin_instructions",
+        "@com_google_absl//absl/functional:bind_front",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
         "@com_google_googletest//:gtest_main",
diff --git a/sim/test/kelvin_encoding_test.cc b/sim/test/kelvin_encoding_test.cc
index 96c4500..8e5285d 100644
--- a/sim/test/kelvin_encoding_test.cc
+++ b/sim/test/kelvin_encoding_test.cc
@@ -100,6 +100,7 @@
 constexpr uint32_t kVAccsBase = 0b001010'000001'000000'00'001000'0'100'00;
 constexpr uint32_t kVAddBase = 0b000000'000000'000001'00'000010'0'000'00;
 constexpr uint32_t kAconvBase = 0b001000'000001'010000'10'110000'0'00'101;
+constexpr uint32_t kVdwconvBase = 0b001000'000001'010000'10'110000'0'10'101;
 
 class KelvinEncodingTest : public testing::Test {
  protected:
@@ -497,6 +498,12 @@
       kVAddBase, OpcodeEnum::kVaddBVv, SourceOpEnum::kVs1);
   EXPECT_EQ(v_src->size(), 1);
   delete v_src;
+
+  // Test vdwconv.vxv
+  v_src = EncodeOpHelper<RV32VectorSourceOperand>(
+      kVdwconvBase, OpcodeEnum::kVdwconvVxv, SourceOpEnum::kVs1);
+  EXPECT_EQ(v_src->size(), 9);
+  delete v_src;
 }
 
 TEST_F(KelvinEncodingTest, KelvinWideningVd) {
@@ -588,6 +595,12 @@
                                                  DestOpEnum::kVd);
   EXPECT_EQ(v_dest->size(), 8);
   delete v_dest;
+
+  // Test vdwconv
+  v_dest = EncodeOpHelper<RV32VectorDestOperand>(
+      kVdwconvBase, OpcodeEnum::kVdwconvVxv, DestOpEnum::kVd);
+  EXPECT_EQ(v_dest->size(), 4);
+  delete v_dest;
 }
 
 TEST_F(KelvinEncodingTest, KelvinEncodeVs3) {
diff --git a/sim/test/kelvin_vector_convolution_instructions_test.cc b/sim/test/kelvin_vector_convolution_instructions_test.cc
index e064785..80c98c1 100644
--- a/sim/test/kelvin_vector_convolution_instructions_test.cc
+++ b/sim/test/kelvin_vector_convolution_instructions_test.cc
@@ -4,11 +4,13 @@
 #include <cstdint>
 #include <cstring>
 #include <functional>
+#include <type_traits>
 #include <vector>
 
 #include "sim/test/kelvin_vector_instructions_test_base.h"
 #include "sim/test/testfiles/kelvin_vector_convolution_testdata.h"
 #include "googletest/include/gtest/gtest.h"
+#include "absl/functional/bind_front.h"
 #include "absl/strings/str_cat.h"
 #include "absl/types/span.h"
 #include "riscv/riscv_state.h"
@@ -20,10 +22,210 @@
 
 // Semantic functions.
 using kelvin::sim::KelvinVConv;
+using kelvin::sim::KelvinVDwconv;
 
 class KelvinVectorConvolutionInstructionsTest
     : public kelvin::sim::test::KelvinVectorInstructionsTestBase {
  protected:
+  // Fills vector register v<reg> with the 32 values offset + [0, 31], placed
+  // so that the dwconv accumulator lane mapping sees them in ascending order.
+  //
+  // KelvinVDwconv sends byte i of a row to accumulator word
+  // kVectorLenInWord * interleave[i & 3] + (i >> 2), interleave = {0, 2, 1, 3}.
+  // This helper applies the inverse: ascending value i is stored at byte
+  // ((i & 7) << 2) | kLaneOrder[(i >> 3) & 3].
+  //
+  // Args:
+  //   reg: index of the vector register to write.
+  //   offset: value stored for i == 0 (callers pass 1 to get [1, 32]).
+  template <typename T>
+  void SetRegisterAscending(int reg, T offset) {
+    // Byte lane (within each group of four) feeding each accumulator block.
+    constexpr uint32_t kLaneOrder[4] = {0, 2, 1, 3};
+    constexpr uint32_t kNumElements = 32;
+
+    std::vector<T> data(kNumElements);
+    for (uint32_t i = 0; i < kNumElements; i++) {
+      const uint32_t lane = kLaneOrder[(i >> 3) & 0b11];
+      const uint32_t pos = i & 0b111;
+      const uint32_t target = (pos << 2) | lane;
+      data[target] = i + offset;
+    }
+
+    auto reg_name = absl::StrCat("v", reg);
+    SetVectorRegisterValues<T>({{reg_name, absl::Span<T>(data)}});
+  }
+
+  template <typename T>
+  void SetRegisterConstant(int reg, T val) {  // Fills v<reg> with val.
+    std::vector<T> data(32, val);  // 32 elements, all equal to val.
+    auto reg_name = absl::StrCat("v", reg);
+    SetVectorRegisterValues<T>({{reg_name, absl::Span<T>(data)}});
+  }
+
+  void ResetDwAccumulator() { state_->dw_acc_register().fill(0); }  // Zeroes it.
+
+  template <bool kWriteAcc = true>
+  void ExecuteDwconv(bool expect_fail = false) {  // Runs one KelvinVDwconv.
+    constexpr int kVs1 = 0;   // First activation register (v0).
+    constexpr int kVs3 = 16;  // First register of the vs3 group (v16).
+    constexpr int kVd = 48;   // First destination register (v48).
+    InstructionPtr instruction = CreateInstruction();
+    instruction->set_semantic_function(
+        absl::bind_front(KelvinVDwconv, kWriteAcc));
+    AppendVectorRegisterOperands(instruction.get(), 1, 9, kVs1, {},
+                                 false /* widen_dst*/, {});
+    AppendRegisterOperands(instruction.get(), {kelvin::sim::test::kRs2Name},
+                           {});
+    AppendVectorRegisterOperands(instruction.get(), 1, 3, kVs3, {},
+                                 false /* widen_dst*/, {});
+    if (kWriteAcc) {  // Only the writing form gets a destination (v48..v51).
+      std::vector<kelvin::sim::test::RegisterBase *> reg_vec;
+      for (int i = 0; i < 4; i++) {
+        auto reg_name = absl::StrCat("v", kVd + i);
+        reg_vec.push_back(
+            state_->GetRegister<kelvin::sim::test::RVVectorRegister>(reg_name)
+                .first);
+      }
+      auto *op = new kelvin::sim::test::RV32VectorDestinationOperand(
+          absl::Span<kelvin::sim::test::RegisterBase *>(reg_vec), 0,
+          absl::StrCat("v", kVd));
+      instruction->AppendDestination(op);
+    }
+    // Presumably trap_call_back_ sets execution_fail_ on a trap (TODO confirm).
+    execution_fail_ = false;
+    state_->set_on_trap(trap_call_back_);
+    instruction->Execute();
+    EXPECT_EQ(expect_fail, execution_fail_);
+  }
+
+  template <bool kWriteAcc = true>
+  void TestAccumulatorAndRegisters(
+      std::function<void(int /*index*/, int32_t /*value*/)> f) {
+    constexpr int kVd = 48;  // First destination register (v48).
+
+    // Call f on each of the 32 words of the internal depthwise accumulator.
+    auto acc_vec = state_->dw_acc_register();
+    for (int i = 0; i < 32; i++) {
+      f(i, acc_vec[i]);
+    }
+
+    // When vd was written, call f on v48..v51 too (4 registers x 8 words).
+    if (kWriteAcc) {
+      for (int r = 0; r < 4; r++) {
+        auto reg = state_
+                       ->GetRegister<kelvin::sim::test::RVVectorRegister>(
+                           absl::StrCat("v", kVd + r))
+                       .first;
+        auto reg_data = reg->data_buffer()->Get<int32_t>();
+        for (int elem = 0; elem < 8; elem++) {
+          int i = (r * 8) + elem;  // Same index space as the accumulator.
+          int32_t value = reg_data[elem];
+          f(i, value);
+        }
+      }
+    }
+  }
+
+  void DepthwiseConvolutionBiasTestHelper(uint32_t sbias1, uint32_t sbias2) {
+    constexpr int kVs1 = 0;
+    constexpr int kVs3 = 16;
+    // Signed data plus the given biases, packed into the command word in rs2.
+    kelvin::sim::vdwconv_u8_t dwconv_cmd;
+    memset(&dwconv_cmd, 0, sizeof(dwconv_cmd));
+    dwconv_cmd.sdata1 = 1;
+    dwconv_cmd.sdata2 = 1;
+    dwconv_cmd.sbias1 = sbias1;
+    dwconv_cmd.sbias2 = sbias2;
+    uint32_t vdwconv_cmd_value;
+    memcpy(&vdwconv_cmd_value, &dwconv_cmd, sizeof(vdwconv_cmd_value));
+    SetRegisterValues<uint32_t>(
+        {{kelvin::sim::test::kRs2Name, vdwconv_cmd_value}});
+    // Activations: row 0 is [1, 32] after sbias1 is added; rows 1-2 are 0.
+    ResetDwAccumulator();
+    SetRegisterAscending<int8_t>(kVs1, 1 - sbias1);
+    SetRegisterConstant<int8_t>(kVs1 + 1, -sbias1);
+    SetRegisterConstant<int8_t>(kVs1 + 2, -sbias1);
+    // Weights: row 0 is 1 after sbias2 is added; rows 1-2 are 0.
+    SetRegisterConstant<int8_t>(kVs3, 1 - sbias2);
+    SetRegisterConstant<int8_t>(kVs3 + 1, -sbias2);
+    SetRegisterConstant<int8_t>(kVs3 + 2, -sbias2);
+    ExecuteDwconv();
+    // Only row 0 contributes, so accumulator word i must equal i + 1.
+    TestAccumulatorAndRegisters(
+        [](int i, int32_t value) { EXPECT_EQ(i + 1, value); });
+  }
+
+  template <typename T, bool kWriteAcc = true>
+  void DepthwiseConvolutionRegbaseTestHelper(int regbase, int prev, int curr,
+                                             int next) {
+    constexpr int kVs1 = 0;
+    constexpr int kVs3 = 16;
+    // prev/curr/next: vs1 register offsets expected as rows 0/1/2 for regbase.
+    kelvin::sim::vdwconv_u8_t dwconv_cmd;
+    memset(&dwconv_cmd, 0, sizeof(dwconv_cmd));
+    dwconv_cmd.regbase = regbase;
+    if (std::is_signed<T>::value) {
+      dwconv_cmd.sdata1 = 1;
+      dwconv_cmd.sdata2 = 1;
+    }
+    uint32_t vdwconv_cmd_value;
+    memcpy(&vdwconv_cmd_value, &dwconv_cmd, sizeof(vdwconv_cmd_value));
+    SetRegisterValues<uint32_t>(
+        {{kelvin::sim::test::kRs2Name, vdwconv_cmd_value}});
+
+    // Row 0: ascending data in `prev` times weight 1 -> i + 1.
+    {
+      ResetDwAccumulator();
+
+      SetRegisterAscending<T>(kVs1 + prev, 1);
+      SetRegisterConstant<T>(kVs1 + curr, 0);
+      SetRegisterConstant<T>(kVs1 + next, 0);
+
+      SetRegisterConstant<T>(kVs3, 1);
+      SetRegisterConstant<T>(kVs3 + 1, 0);
+      SetRegisterConstant<T>(kVs3 + 2, 0);
+
+      ExecuteDwconv<kWriteAcc>();
+      TestAccumulatorAndRegisters<kWriteAcc>(
+          [](int i, int32_t value) { EXPECT_EQ(i + 1, value); });
+    }
+
+    // Row 1: ascending data in `curr` times weight 2 -> 2 * (i + 1).
+    {
+      ResetDwAccumulator();
+
+      SetRegisterConstant<T>(kVs1 + prev, 0);
+      SetRegisterAscending<T>(kVs1 + curr, 1);
+      SetRegisterConstant<T>(kVs1 + next, 0);
+
+      SetRegisterConstant<T>(kVs3, 0);
+      SetRegisterConstant<T>(kVs3 + 1, 2);
+      SetRegisterConstant<T>(kVs3 + 2, 0);
+
+      ExecuteDwconv<kWriteAcc>();
+      TestAccumulatorAndRegisters<kWriteAcc>(
+          [](int i, int32_t value) { EXPECT_EQ(2 * (i + 1), value); });
+    }
+
+    // Row 2: ascending data in `next` times weight 3 -> 3 * (i + 1).
+    {
+      ResetDwAccumulator();
+
+      SetRegisterConstant<T>(kVs1 + prev, 0);
+      SetRegisterConstant<T>(kVs1 + curr, 0);
+      SetRegisterAscending<T>(kVs1 + next, 1);
+
+      SetRegisterConstant<T>(kVs3, 0);
+      SetRegisterConstant<T>(kVs3 + 1, 0);
+      SetRegisterConstant<T>(kVs3 + 2, 3);
+
+      ExecuteDwconv<kWriteAcc>();
+      TestAccumulatorAndRegisters<kWriteAcc>(
+          [](int i, int32_t value) { EXPECT_EQ(3 * (i + 1), value); });
+    }
+  }
+
   void ConvolutionTestHelper(const kelvin::sim::vconv_cmd_t vconv_cmd,
                              bool expect_fail = false) {
     constexpr int kVs1 = 0;
@@ -161,4 +363,231 @@
                                      .sdata2 = true};
   ConvolutionTestHelper(vconv_cmd, true);
 }
+
+TEST_F(KelvinVectorConvolutionInstructionsTest, VDwconvRegbase) {
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, true>(0, 0, 1, 2);
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, true>(1, 1, 2, 3);
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, true>(2, 2, 3, 4);
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, true>(3, 3, 4, 5);
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, true>(4, 4, 5, 6);
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, true>(5, 5, 6, 7);
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, true>(6, 6, 7, 8);
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, true>(7, 1, 0, 2);
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, true>(8, 1, 2, 0);
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, true>(9, 3, 4, 0);
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, true>(10, 5, 6, 0);
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, true>(11, 7, 8, 0);
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, true>(12, 2, 0, 1);
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, true>(13, 4, 0, 1);
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, true>(14, 6, 0, 1);
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, true>(15, 8, 0, 1);
+  // Same (regbase, prev, curr, next) table with signed data, writing vd.
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, true>(0, 0, 1, 2);
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, true>(1, 1, 2, 3);
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, true>(2, 2, 3, 4);
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, true>(3, 3, 4, 5);
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, true>(4, 4, 5, 6);
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, true>(5, 5, 6, 7);
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, true>(6, 6, 7, 8);
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, true>(7, 1, 0, 2);
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, true>(8, 1, 2, 0);
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, true>(9, 3, 4, 0);
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, true>(10, 5, 6, 0);
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, true>(11, 7, 8, 0);
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, true>(12, 2, 0, 1);
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, true>(13, 4, 0, 1);
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, true>(14, 6, 0, 1);
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, true>(15, 8, 0, 1);
+  // Unsigned data, accumulate only (no vd write).
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, false>(0, 0, 1, 2);
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, false>(1, 1, 2, 3);
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, false>(2, 2, 3, 4);
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, false>(3, 3, 4, 5);
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, false>(4, 4, 5, 6);
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, false>(5, 5, 6, 7);
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, false>(6, 6, 7, 8);
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, false>(7, 1, 0, 2);
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, false>(8, 1, 2, 0);
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, false>(9, 3, 4, 0);
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, false>(10, 5, 6, 0);
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, false>(11, 7, 8, 0);
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, false>(12, 2, 0, 1);
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, false>(13, 4, 0, 1);
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, false>(14, 6, 0, 1);
+  DepthwiseConvolutionRegbaseTestHelper<uint8_t, false>(15, 8, 0, 1);
+  // Signed data, accumulate only (no vd write).
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, false>(0, 0, 1, 2);
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, false>(1, 1, 2, 3);
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, false>(2, 2, 3, 4);
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, false>(3, 3, 4, 5);
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, false>(4, 4, 5, 6);
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, false>(5, 5, 6, 7);
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, false>(6, 6, 7, 8);
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, false>(7, 1, 0, 2);
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, false>(8, 1, 2, 0);
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, false>(9, 3, 4, 0);
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, false>(10, 5, 6, 0);
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, false>(11, 7, 8, 0);
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, false>(12, 2, 0, 1);
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, false>(13, 4, 0, 1);
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, false>(14, 6, 0, 1);
+  DepthwiseConvolutionRegbaseTestHelper<int8_t, false>(15, 8, 0, 1);
+}
+
+TEST_F(KelvinVectorConvolutionInstructionsTest, VDwconvSignBiases) {
+  DepthwiseConvolutionBiasTestHelper(2, 0);  // sbias1 only.
+  DepthwiseConvolutionBiasTestHelper(0, 3);  // sbias2 only.
+  DepthwiseConvolutionBiasTestHelper(5, 5);  // Both biases.
+}
+
+TEST_F(KelvinVectorConvolutionInstructionsTest, VDwconvSparsity1) {
+  constexpr int kVs1 = 0;
+  constexpr int kVs3 = 16;
+
+  kelvin::sim::vdwconv_u8_t dwconv_cmd;
+  memset(&dwconv_cmd, 0, sizeof(dwconv_cmd));
+  dwconv_cmd.regbase = 0;
+  dwconv_cmd.sparsity = 1;
+  uint32_t vdwconv_cmd_value;
+  memcpy(&vdwconv_cmd_value, &dwconv_cmd, sizeof(vdwconv_cmd_value));
+  SetRegisterValues<uint32_t>(
+      {{kelvin::sim::test::kRs2Name, vdwconv_cmd_value}});
+
+  {
+    ResetDwAccumulator();
+    // Row 0 = {v0 word 7, v1 words 0..6}: each group of 8 starts with 42.
+    SetRegisterConstant<uint8_t>(kVs1, 42);
+    SetRegisterAscending<uint8_t>(kVs1 + 1, 1);
+    SetRegisterConstant<uint8_t>(kVs1 + 2, 0);
+
+    SetRegisterConstant<uint8_t>(kVs3, 1);
+    SetRegisterConstant<uint8_t>(kVs3 + 1, 0);
+    SetRegisterConstant<uint8_t>(kVs3 + 2, 0);
+
+    ExecuteDwconv();
+    TestAccumulatorAndRegisters([](int i, int32_t value) {
+      EXPECT_EQ((i % 8 == 0 ? 42 : i), value)
+          << "Incorrect value at index " << i;
+    });
+  }
+
+  {
+    ResetDwAccumulator();
+    // Row 1 = v1 unshifted: accumulator word i reads i + 1.
+    SetRegisterConstant<uint8_t>(kVs1, 0);
+    SetRegisterAscending<uint8_t>(kVs1 + 1, 1);
+    SetRegisterConstant<uint8_t>(kVs1 + 2, 0);
+
+    SetRegisterConstant<uint8_t>(kVs3, 0);
+    SetRegisterConstant<uint8_t>(kVs3 + 1, 1);
+    SetRegisterConstant<uint8_t>(kVs3 + 2, 0);
+
+    ExecuteDwconv();
+    TestAccumulatorAndRegisters([](int i, int32_t value) {
+      EXPECT_EQ(i + 1, value) << "Incorrect value at index " << i;
+    });
+  }
+
+  {
+    ResetDwAccumulator();
+    // Row 2 = {v1 words 1..7, v2 word 0}: each group of 8 ends with 42.
+    SetRegisterConstant<uint8_t>(kVs1, 0);
+    SetRegisterAscending<uint8_t>(kVs1 + 1, 0);
+    SetRegisterConstant<uint8_t>(kVs1 + 2, 42);
+
+    SetRegisterConstant<uint8_t>(kVs3, 0);
+    SetRegisterConstant<uint8_t>(kVs3 + 1, 0);
+    SetRegisterConstant<uint8_t>(kVs3 + 2, 1);
+
+    ExecuteDwconv();
+    TestAccumulatorAndRegisters([](int i, int32_t value) {
+      EXPECT_EQ((i % 8 == 7 ? 42 : i + 1), value)
+          << "Incorrect value at index " << i;
+    });
+  }
+}
+
+TEST_F(KelvinVectorConvolutionInstructionsTest, VDwconvSparsity2) {
+  constexpr int kVs1 = 0;
+  constexpr int kVs3 = 16;
+
+  kelvin::sim::vdwconv_u8_t dwconv_cmd;
+  memset(&dwconv_cmd, 0, sizeof(dwconv_cmd));
+  dwconv_cmd.regbase = 0;
+  dwconv_cmd.sparsity = 2;
+  uint32_t vdwconv_cmd_value;
+  memcpy(&vdwconv_cmd_value, &dwconv_cmd, sizeof(vdwconv_cmd_value));
+  SetRegisterValues<uint32_t>(
+      {{kelvin::sim::test::kRs2Name, vdwconv_cmd_value}});
+
+  {
+    ResetDwAccumulator();
+    // Row 0 = v0 unshifted: accumulator word i reads i + 1.
+    SetRegisterAscending<uint8_t>(kVs1, 1);
+    SetRegisterConstant<uint8_t>(kVs1 + 1, 0);
+    SetRegisterConstant<uint8_t>(kVs1 + 2, 0);
+
+    SetRegisterConstant<uint8_t>(kVs3, 1);
+    SetRegisterConstant<uint8_t>(kVs3 + 1, 0);
+    SetRegisterConstant<uint8_t>(kVs3 + 2, 0);
+
+    ExecuteDwconv();
+    TestAccumulatorAndRegisters([](int i, int32_t value) {
+      EXPECT_EQ(i + 1, value) << "Incorrect value at index " << i;
+    });
+  }
+
+  {
+    ResetDwAccumulator();
+    // Row 1 = {v0 words 1..7, v1 word 0}: shifted one word, 42 at the end.
+    SetRegisterAscending<uint8_t>(kVs1, 0);
+    SetRegisterConstant<uint8_t>(kVs1 + 1, 42);
+    SetRegisterConstant<uint8_t>(kVs1 + 2, 0);
+
+    SetRegisterConstant<uint8_t>(kVs3, 0);
+    SetRegisterConstant<uint8_t>(kVs3 + 1, 1);
+    SetRegisterConstant<uint8_t>(kVs3 + 2, 0);
+
+    ExecuteDwconv();
+    TestAccumulatorAndRegisters([](int i, int32_t value) {
+      EXPECT_EQ((i % 8 == 7 ? 42 : i + 1), value)
+          << "Incorrect value at index " << i;
+    });
+  }
+
+  {
+    ResetDwAccumulator();
+    // Row 2 = {v0 words 2..7, v1 words 0..1}: shifted two, 42 in the last two.
+    SetRegisterAscending<uint8_t>(kVs1, 0);
+    SetRegisterConstant<uint8_t>(kVs1 + 1, 42);
+    SetRegisterConstant<uint8_t>(kVs1 + 2, 0);
+
+    SetRegisterConstant<uint8_t>(kVs3, 0);
+    SetRegisterConstant<uint8_t>(kVs3 + 1, 0);
+    SetRegisterConstant<uint8_t>(kVs3 + 2, 1);
+
+    ExecuteDwconv();
+    TestAccumulatorAndRegisters([](int i, int32_t value) {
+      if (i % 8 >= 6) {
+        EXPECT_EQ(42, value) << "Incorrect value at index " << i;
+      } else {
+        EXPECT_EQ(i + 2, value) << "Incorrect value at index " << i;
+      }
+    });
+  }
+}
+
+TEST_F(KelvinVectorConvolutionInstructionsTest, VDwconvSparsity3) {
+  // Sparsity 3 is invalid: KelvinVDwconv raises an illegal-instruction trap.
+  kelvin::sim::vdwconv_u8_t dwconv_cmd;
+  memset(&dwconv_cmd, 0, sizeof(dwconv_cmd));
+  dwconv_cmd.regbase = 0;
+  dwconv_cmd.sparsity = 3;
+  uint32_t vdwconv_cmd_value;
+  memcpy(&vdwconv_cmd_value, &dwconv_cmd, sizeof(vdwconv_cmd_value));
+  SetRegisterValues<uint32_t>(
+      {{kelvin::sim::test::kRs2Name, vdwconv_cmd_value}});
+  ExecuteDwconv(/* expect_fail */ true);
+}
+
 }  // namespace
diff --git a/sim/test/kelvin_vector_memory_instructions_test.cc b/sim/test/kelvin_vector_memory_instructions_test.cc
index 524b3a4..4191c84 100644
--- a/sim/test/kelvin_vector_memory_instructions_test.cc
+++ b/sim/test/kelvin_vector_memory_instructions_test.cc
@@ -616,11 +616,17 @@
     auto vref_num = kVs + i;
     auto ref_vreg = vreg_[vref_num];
     auto ref_span = ref_vreg->data_buffer()->Get<uint8_t>();
+
+    uint8_t *dwacc_span =
+        reinterpret_cast<uint8_t *>(state_->dw_acc_vec(8 * i));
     for (int element_index = 0; element_index < ref_span.size() / 4;
          element_index++) {
       EXPECT_EQ(vreg_span[element_index], ref_span[element_index])
           << absl::StrCat("vreg[", vreg_num, "][", element_index, "] != ref[",
                           vref_num, "][", element_index, "]");
+      EXPECT_EQ(dwacc_span[element_index], ref_span[element_index])
+          << absl::StrCat("dwacc_span[", vreg_num, "][", element_index,
+                          "] != ref[", vref_num, "][", element_index, "]");
     }
   }
 }