Merge "[HW][Kelvin][Sram_1rwm_256x288.v] move FPGA defined macro"
diff --git a/hdl/chisel/BUILD b/hdl/chisel/BUILD
index cfad4b6..2785e9b 100644
--- a/hdl/chisel/BUILD
+++ b/hdl/chisel/BUILD
@@ -20,7 +20,7 @@
     deps = [
         ":common",
         ":kelvin",
-    ]
+    ],
 )
 
 chisel_cc_library(
@@ -32,7 +32,7 @@
         "//hdl/verilog:clock_gate",
         "//hdl/verilog:sram_1rw_256x256",
         "//hdl/verilog:sram_1rw_256x288",
-    ]
+    ],
 )
 
 chisel_cc_library(
@@ -70,6 +70,20 @@
 )
 
 chisel_cc_library(
+    name = "valu_cc_library",
+    chisel_lib = ":kelvin",
+    emit_class = "kelvin.EmitVAlu",
+    module_name = "VAlu",
+)
+
+chisel_cc_library(
+    name = "valuint_cc_library",
+    chisel_lib = ":kelvin",
+    emit_class = "kelvin.EmitVAluInt",
+    module_name = "VAluInt",
+)
+
+chisel_cc_library(
     name = "vcmdq_cc_library",
     chisel_lib = ":kelvin",
     emit_class = "kelvin.EmitVCmdq",
diff --git a/tests/verilator_sim/BUILD b/tests/verilator_sim/BUILD
index b0216ce..452c11f 100644
--- a/tests/verilator_sim/BUILD
+++ b/tests/verilator_sim/BUILD
@@ -88,7 +88,43 @@
     ],
 )
 
-# TODO(derekjchow): Add valu and valuint test benches
+cc_library(  # Header-only target: exports alu_ref.h and valu.h, no srcs.
+    name = "valu",
+    hdrs = [
+        "kelvin/alu_ref.h",
+        "kelvin/valu.h",
+    ],
+    deps = [
+        ":vencodeop",
+    ],
+)
+
+cc_test(  # Test bench built from kelvin/valu_tb.cc against the VAlu library.
+    name = "valu_tb",
+    size = "large",
+    srcs = [
+        "kelvin/valu_tb.cc",
+    ],
+    deps = [
+        ":kelvin_if",
+        ":sim_libs",
+        ":valu",
+        "//hdl/chisel:valu_cc_library",
+    ],
+)
+
+cc_test(  # Test bench built from kelvin/valuint_tb.cc against VAluInt.
+    name = "valuint_tb",
+    srcs = [
+        "kelvin/valuint_tb.cc",
+    ],
+    deps = [
+        ":kelvin_if",
+        ":sim_libs",
+        ":valu",
+        "//hdl/chisel:valuint_cc_library",
+    ],
+)
 
 cc_library(
     name = "vdecode",
diff --git a/tests/verilator_sim/kelvin/alu_ref.h b/tests/verilator_sim/kelvin/alu_ref.h
new file mode 100644
index 0000000..82d950f
--- /dev/null
+++ b/tests/verilator_sim/kelvin/alu_ref.h
@@ -0,0 +1,438 @@
+// Copyright 2023 Google LLC
+//
+// Reference alu ops implementation
+#ifndef TESTS_VERILATOR_SIM_KELVIN_ALU_REF_H_
+#define TESTS_VERILATOR_SIM_KELVIN_ALU_REF_H_
+
+#include <stdint.h>
+
+#include <algorithm>
+#include <limits>
+#include <type_traits>
+#include <utility>
+
+// -----------------------------------------------------------------------------
+// ALU.
+
+template <typename T>  // |a - b| in the unsigned counterpart of T.
+typename std::make_unsigned<T>::type absd(T a, T b) {
+  using Unsigned = typename std::make_unsigned<T>::type;
+  // Order with the typed comparison, then subtract in the unsigned domain.
+  const Unsigned hi = static_cast<Unsigned>(a > b ? a : b);
+  return hi - static_cast<Unsigned>(a > b ? b : a);
+}
+
+template <typename Td, typename Ts>  // Wrapping a + b; b is widened to Td.
+Td acc(Td a, Ts b) {
+  static_assert(sizeof(Td) > sizeof(Ts), "acc: Td must be wider than Ts");
+  using UTd = typename std::make_unsigned<Td>::type;
+  return static_cast<Td>(static_cast<UTd>(a) + static_cast<UTd>(b));
+}
+
+template <typename T>  // Wrapping (mod 2^N) addition of a and b.
+T add(T a, T b) {
+  using U = typename std::make_unsigned<T>::type;
+  return static_cast<T>(static_cast<U>(b) + static_cast<U>(a));
+}
+
+template <typename T>  // Wrapping (mod 2^N) sum of three operands.
+T add3(T a, T b, T c) {
+  using U = typename std::make_unsigned<T>::type;
+  const U ab = static_cast<U>(static_cast<U>(a) + static_cast<U>(b));
+  return static_cast<T>(static_cast<U>(ab + static_cast<U>(c)));
+}
+
+// Saturating addition: exact 64-bit sum, then clamped to the range of T.
+template <typename T>
+T adds(T a, T b) {
+  if (!std::is_signed<T>::value) {
+    const uint64_t total = static_cast<uint64_t>(a) + static_cast<uint64_t>(b);
+    const uint64_t cap = std::numeric_limits<T>::max();
+    return total < cap ? total : cap;
+  }
+  const int64_t lo = std::numeric_limits<T>::min();
+  const int64_t hi = std::numeric_limits<T>::max();
+  const int64_t total = static_cast<int64_t>(a) + static_cast<int64_t>(b);
+  return total < lo ? lo : (total > hi ? hi : total);
+}
+
+// Widening add: the 64-bit sum of a and b truncated to a 32-bit result.
+template <typename T>
+uint32_t addw(T a, T b) {
+  const uint64_t wide = std::is_signed<T>::value
+      ? static_cast<uint64_t>(static_cast<int64_t>(a) + static_cast<int64_t>(b))
+      : static_cast<uint64_t>(a) + static_cast<uint64_t>(b);
+  return static_cast<uint32_t>(wide);
+}
+
+template <typename T>  // 1 when a equals b, otherwise 0, expressed in T.
+T cmp_eq(T a, T b) {
+  return static_cast<T>(a == b ? 1 : 0);
+}
+
+template <typename T>  // 1 when a differs from b, otherwise 0, expressed in T.
+T cmp_ne(T a, T b) {
+  return static_cast<T>(a == b ? 0 : 1);
+}
+
+template <typename T>  // 1 when a is strictly below b, otherwise 0.
+T cmp_lt(T a, T b) {
+  return static_cast<T>(b > a ? 1 : 0);
+}
+
+template <typename T>  // 1 when a does not exceed b, otherwise 0.
+T cmp_le(T a, T b) {
+  return static_cast<T>(a > b ? 0 : 1);
+}
+
+template <typename T>  // 1 when a is strictly above b, otherwise 0.
+T cmp_gt(T a, T b) {
+  return static_cast<T>(b < a ? 1 : 0);
+}
+
+template <typename T>  // 1 when a is not below b, otherwise 0.
+T cmp_ge(T a, T b) {
+  return static_cast<T>(a < b ? 0 : 1);
+}
+
+template <typename T>  // Identity on the operand: returns b unchanged.
+T dup(T b) {
+  return static_cast<T>(b);
+}
+
+template <typename T>  // Bitwise AND of a and b.
+T log_and(T a, T b) {
+  return static_cast<T>(b & a);
+}
+
+template <typename T>  // Counts the leading bits of x that equal its MSB.
+int log_clb(T x) {
+  constexpr int kBits = sizeof(T) * 8;
+  // Invert values with the top bit set so the scan below counts leading zeros.
+  if (x & (1u << (kBits - 1))) {
+    x = ~x;
+  }
+  int lead = 0;
+  while (lead < kBits && !((x << lead) >> (kBits - 1))) {
+    ++lead;
+  }
+  return lead;
+}
+
+template <typename T>  // Leading zero count; sizeof(T) * 8 when x is zero.
+int log_clz(const T x) {
+  constexpr int kBits = sizeof(T) * 8;
+  int zeros = 0;
+  // Advance until shifting left by `zeros` brings a set bit into the MSB.
+  while (zeros < kBits && !((x << zeros) >> (kBits - 1))) {
+    ++zeros;
+  }
+  return zeros;
+}
+
+template <typename T>  // Population count: number of set bits in a.
+int log_cpop(T a) {
+  constexpr int n = sizeof(T) * 8;
+  // Shift the operand down: `1 << i` would overflow a signed int at i == 31.
+  const auto bits = static_cast<typename std::make_unsigned<T>::type>(a);
+  int count = 0;
+  for (int i = 0; i < n; i++) {
+    count += (bits >> i) & 1;
+  }
+  return count;
+}
+
+template <typename T>  // Bitwise complement of a, truncated back to T.
+T log_not(T a) {
+  return static_cast<T>(~a);
+}
+
+template <typename T>  // Bitwise OR of a and b.
+T log_or(T a, T b) {
+  return static_cast<T>(b | a);
+}
+
+template <typename T>  // Swaps adjacent 1/2/4/8/16-bit groups selected by b.
+T log_rev(T a, T b) {
+  // Bit k of b enables the stage swapping groups of 2^k bits (k = 0..4).
+  if (b & 1) a = ((a & 0x55555555) << 1) | ((a & 0xAAAAAAAA) >> 1);
+  if (b & 2) a = ((a & 0x33333333) << 2) | ((a & 0xCCCCCCCC) >> 2);
+  if (b & 4) a = ((a & 0x0F0F0F0F) << 4) | ((a & 0xF0F0F0F0) >> 4);
+  if (sizeof(T) > 1 && (b & 8))
+    a = ((a & 0x00FF00FF) << 8) | ((a & 0xFF00FF00) >> 8);
+  if (sizeof(T) > 2 && (b & 16))
+    a = ((a & 0x0000FFFF) << 16) | ((a & 0xFFFF0000) >> 16);
+  return a;
+}
+
+// Rotates a right by (b mod bit-width of T); bits leaving the LSB re-enter at
+// the MSB.
+//
+// Only 8-, 16- and 32-bit lanes are supported, and that is enforced at compile
+// time so an unsupported width can never slip through as a silent no-op. The
+// rotation itself runs on the unsigned counterpart of T, which keeps
+// operator>> from sign-extending when T is signed.
+template <typename T>
+T log_ror(T a, T b) {
+  static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4,
+                "log_ror supports 8-, 16- and 32-bit lanes only");
+  using UT = typename std::make_unsigned<T>::type;
+  constexpr int n = sizeof(T) * 8;
+  const int amount = static_cast<int>(b & (n - 1));
+  // A zero rotation is returned early: `bits << n` below would be out of range.
+  if (amount == 0) {
+    return a;
+  }
+  const UT bits = static_cast<UT>(a);
+  const UT rotated = static_cast<UT>((bits >> amount) | (bits << (n - amount)));
+  return static_cast<T>(rotated);
+}
+
+template <typename T>  // Bitwise exclusive OR of a and b.
+T log_xor(T a, T b) {
+  return static_cast<T>(b ^ a);
+}
+
+template <typename T>  // Halving add: (a + b + r) >> 1 computed in 64 bits.
+T hadd(T a, T b, int r) {
+  using Wide = typename std::conditional<std::is_signed<T>::value, int64_t,
+                                         uint64_t>::type;
+  const Wide sum = static_cast<Wide>(a) + static_cast<Wide>(b) + r;
+  return sum >> 1;
+}
+
+template <typename T>  // Halving subtract: (a - b + r) >> 1 computed in 64 bits.
+T hsub(T a, T b, int r) {
+  using Wide = typename std::conditional<std::is_signed<T>::value, int64_t,
+                                         uint64_t>::type;
+  const Wide diff = static_cast<Wide>(a) - static_cast<Wide>(b) + r;
+  return diff >> 1;
+}
+
+// Multiply-add a * b + c, evaluated in 64 bits and truncated back to T on
+// return.
+template <typename T>
+T madd(T a, T b, T c) {
+  using Wide = typename std::conditional<std::is_signed<T>::value, int64_t,
+                                         uint64_t>::type;
+  const Wide product = static_cast<Wide>(a) * static_cast<Wide>(b);
+  return product + static_cast<Wide>(c);
+}
+
+template <typename T>  // Larger of a and b (b when they compare equal).
+T max(T a, T b) {
+  return b < a ? a : b;
+}
+
+template <typename T>  // Smaller of a and b (b when they compare equal).
+T min(T a, T b) {
+  return b > a ? a : b;
+}
+
+template <typename T>  // Wrapping (mod 2^N) multiply of a and b.
+T mul(T a, T b) {  // Widened to uint64_t: a * b in T/int can overflow (UB).
+  return static_cast<T>(static_cast<uint64_t>(a) * static_cast<uint64_t>(b));
+}
+
+// Saturating multiply: exact 64-bit product clamped to the range of T.
+template <typename T>
+T muls(T a, T b) {
+  if (!std::is_signed<T>::value) {
+    const uint64_t prod = static_cast<uint64_t>(a) * static_cast<uint64_t>(b);
+    const uint64_t cap = std::numeric_limits<T>::max();
+    return prod < cap ? prod : cap;
+  }
+  const int64_t lo = std::numeric_limits<T>::min();
+  const int64_t hi = std::numeric_limits<T>::max();
+  const int64_t prod = static_cast<int64_t>(a) * static_cast<int64_t>(b);
+  return prod < lo ? lo : (prod > hi ? hi : prod);
+}
+
+// Widening multiplication: 64-bit product of a and b truncated to 32 bits.
+template <typename T>
+uint32_t mulw(T a, T b) {
+  const uint64_t wide = std::is_signed<T>::value
+      ? static_cast<uint64_t>(static_cast<int64_t>(a) * static_cast<int64_t>(b))
+      : static_cast<uint64_t>(a) * static_cast<uint64_t>(b);
+  return static_cast<uint32_t>(wide);
+}
+
+template <typename T>  // Identity on the operand: returns a unchanged.
+T mv(T a) {
+  return static_cast<T>(a);
+}
+
+template <typename T>  // Bundles a (first) and b (second) into one pair.
+std::pair<T, T> mvp(T a, T b) {
+  return std::make_pair(a, b);
+}
+
+template <typename T>  // Doubling multiply-high with optional rounding.
+T dmulh(T a, T b, bool r, bool neg) {
+  constexpr int n = sizeof(T) * 8;
+  constexpr T maxNeg = 0x80000000 >> (32 - n);  // Only the MSB of T set.
+  int64_t m = static_cast<int64_t>(a) * static_cast<int64_t>(b);
+  if (r) {
+    int64_t rnd = 0x40000000ll >> (32 - n);  // 2^(n-2): half of the result LSB.
+    if (m < 0 && neg) {
+      rnd = (-0x40000000ll) >> (32 - n);  // Negative bias, negative product.
+    }
+    m += rnd;
+  }
+  m >>= (n - 1);  // Drop the low n - 1 bits of the (biased) product.
+
+  if (a == maxNeg && b == maxNeg) {
+    m = 0x7fffffff >> (32 - n);  // Pin this case to 2^(n-1) - 1.
+  }
+
+  return m;
+}
+
+// High half of the 2n-bit product a * b, i.e. (a * b + bias) >> n, where bias
+// is 1 << (n - 1) when r requests rounding and 0 otherwise.
+template <typename T>
+T mulh(T a, T b, bool r) {
+  constexpr int kBits = sizeof(T) * 8;
+  const uint64_t bias = r ? 1ull << (kBits - 1) : 0;
+  // Sign- or zero-extend per T; the sum is then formed modulo 2^64.
+  const uint64_t product = std::is_signed<T>::value
+      ? static_cast<uint64_t>(static_cast<int64_t>(a) * static_cast<int64_t>(b))
+      : static_cast<uint64_t>(a) * static_cast<uint64_t>(b);
+  return (product + bias) >> kBits;
+}
+
+template <typename T>  // a + b formed in 64 bits, returned as int32_t.
+int32_t padd(T a, T b) {
+  using Wide = typename std::conditional<std::is_signed<T>::value, int64_t,
+                                         uint64_t>::type;
+  const Wide sum = static_cast<Wide>(a) + static_cast<Wide>(b);
+  return sum;
+}
+
+template <typename T>  // a - b formed in 64 bits, truncated to uint32_t.
+uint32_t psub(T a, T b) {
+  using Wide = typename std::conditional<std::is_signed<T>::value, int64_t,
+                                         uint64_t>::type;
+  const Wide diff = static_cast<Wide>(a) - static_cast<Wide>(b);
+  return diff;
+}
+
+template <typename T>  // Reverse subtract: wrapping (mod 2^N) b - a.
+T rsub(T a, T b) {
+  using U = typename std::make_unsigned<T>::type;
+  return static_cast<T>(static_cast<U>(b) - static_cast<U>(a));
+}
+
+template <typename T>  // a << (b mod bit-width of T).
+T shl(T a, T b) {
+  constexpr int kMask = sizeof(T) * 8 - 1;
+  const T amount = b & kMask;
+  return a << amount;
+}
+
+template <typename T>  // a >> (b mod bit-width of T), via T's own operator>>.
+T shr(T a, T b) {
+  constexpr int kMask = sizeof(T) * 8 - 1;
+  const T amount = b & kMask;
+  return a >> amount;
+}
+
+template <typename T1, typename T2>  // Saturating narrowing a >> b.
+T1 srans(T2 a, T1 b, bool r, bool u) {
+  static_assert(2 * sizeof(T1) == sizeof(T2) || 4 * sizeof(T1) == sizeof(T2));
+  static_assert(std::is_signed<T1>::value, "srans: T1 must be signed");
+  static_assert(std::is_signed<T2>::value, "srans: T2 must be signed");
+  constexpr int n = sizeof(T2) * 8;
+  constexpr int m = sizeof(T1) * 8;
+  b &= (n - 1);  // Shift amount is taken modulo the source width.
+  int64_t s = (static_cast<int64_t>(a) + (b && r ? (1ll << (b - 1)) : 0)) >> b;
+  int64_t neg_max = !u ? -(1ll << (m - 1)) : 0;  // Negate after the shift.
+  int64_t pos_max = !u ? (1ll << (m - 1)) - 1 : (1ull << m) - 1;
+  bool neg_sat = s < neg_max;
+  bool pos_sat = s > pos_max;
+  bool zero = !a;
+  if (neg_sat) return neg_max;  // u clamps to [0, 2^m - 1], else signed range.
+  if (pos_sat) return pos_max;
+  if (zero) return 0;
+  return s;
+}
+
+template <typename T>  // Shift a by signed b: right if b > 0, left otherwise.
+T shf(T a, T b, bool r) {
+  if (std::is_signed<T>::value == true) {
+    constexpr int n = sizeof(T) * 8;
+    int shamt = b;
+    int64_t s = a;
+    if (!a) {
+      return 0;
+    } else if (a < 0 && shamt >= n) {
+      s = -1 + r;  // All sign bits, bumped to 0 when rounding is requested.
+    } else if (a > 0 && shamt >= n) {
+      s = 0;
+    } else if (shamt > 0) {
+      s = (static_cast<int64_t>(a) + (r ? (1ll << (shamt - 1)) : 0)) >> shamt;
+    } else {  // shamt <= 0: left shift by -shamt, capped at n.
+      using UT = typename std::make_unsigned<T>::type;
+      UT ushamt = static_cast<UT>(-shamt <= n ? -shamt : n);
+      s = static_cast<int64_t>(static_cast<uint64_t>(a) << ushamt);
+    }
+
+    int64_t neg_max = -1ll << (n - 1);  // -2^(n-1), lower saturation bound.
+    int64_t pos_max = (1ll << (n - 1)) - 1;  // 2^(n-1) - 1, upper bound.
+    bool neg_sat = a < 0 && (shamt <= -n || s < neg_max);
+    bool pos_sat = a > 0 && (shamt <= -n || s > pos_max);
+    if (neg_sat) return neg_max;
+    if (pos_sat) return pos_max;
+
+    return s;
+  }
+  constexpr int n = sizeof(T) * 8;
+  int shamt = static_cast<typename std::make_signed<T>::type>(b);
+  uint64_t s = a;
+  if (!a) {
+    return 0;
+  } else if (shamt > n) {
+    s = 0;
+  } else if (shamt > 0) {
+    s = (static_cast<uint64_t>(a) + (r ? (1ull << (shamt - 1)) : 0)) >> shamt;
+  } else {  // shamt <= 0: left shift by -shamt, capped at n.
+    T ushamt = static_cast<T>(-shamt <= n ? -shamt : n);
+    s = static_cast<uint64_t>(a) << (ushamt);
+  }
+
+  uint64_t pos_max = (1ull << n) - 1;  // 2^n - 1, unsigned saturation bound.
+  bool pos_sat = a && (shamt < -n || s >= (1ull << n));
+  if (pos_sat) return pos_max;
+
+  return s;
+}
+
+template <typename T>  // Wrapping (mod 2^N) subtraction a - b.
+T sub(T a, T b) {
+  using U = typename std::make_unsigned<T>::type;
+  return static_cast<T>(static_cast<U>(a) - static_cast<U>(b));
+}
+
+// Saturating subtraction: the exact difference clamped to the range of T.
+template <typename T>
+T subs(T a, T b) {
+  if (std::is_signed<T>::value) {
+    const int64_t lo = std::numeric_limits<T>::min();
+    const int64_t hi = std::numeric_limits<T>::max();
+    const int64_t diff = static_cast<int64_t>(a) - static_cast<int64_t>(b);
+    return diff < lo ? lo : (diff > hi ? hi : diff);
+  }
+  // Unsigned underflow saturates at 0. Clamping the wrapped 64-bit difference
+  // against max() instead would turn every a < b case into the maximum value.
+  return a < b ? T{0} : static_cast<T>(a - b);
+}
+
+template <typename T>  // Widening subtract: 64-bit a - b truncated to 32 bits.
+uint32_t subw(T a, T b) {
+  using Wide = typename std::conditional<std::is_signed<T>::value, int64_t,
+                                         uint64_t>::type;
+  const Wide diff = static_cast<Wide>(a) - static_cast<Wide>(b);
+  return diff;
+}
+
+#endif  // TESTS_VERILATOR_SIM_KELVIN_ALU_REF_H_
diff --git a/tests/verilator_sim/kelvin/valu.h b/tests/verilator_sim/kelvin/valu.h
index afd7b28..0719491 100644
--- a/tests/verilator_sim/kelvin/valu.h
+++ b/tests/verilator_sim/kelvin/valu.h
@@ -3,7 +3,7 @@
 #ifndef TESTS_VERILATOR_SIM_KELVIN_VALU_H_
 #define TESTS_VERILATOR_SIM_KELVIN_VALU_H_
 
-#include "tools/iss/alu.h"  // Modified
+#include "tests/verilator_sim/kelvin/alu_ref.h"
 #include "tests/verilator_sim/kelvin/kelvin_cfg.h"
 #include "tests/verilator_sim/kelvin/vencodeop.h"
 
@@ -11,8 +11,6 @@
 constexpr int kReadPorts = 7;
 constexpr int kWritePorts = 4;
 
-using namespace encode;
-
 struct valu_t {
   uint8_t op : 7;
   uint8_t f2 : 3;
@@ -98,7 +96,7 @@
     x = func(uint32_t(a));                                           \
   }
 
-#define VOP1PU(func)                                                  \
+#define VOP1PU(func)                                                 \
   if (sz == 1) {                                                     \
     v = 1;                                                           \
     w = 1;                                                           \
@@ -691,9 +689,9 @@
 void VSlidevn(valu_t& op) {
   constexpr int n = kLanes * 4 / sizeof(T);
   const int shfamt = (op.f2 & 3) + 1;
-  const T* in0 = (const T*)op.in[0].data;
-  const T* in1 = (const T*)op.in[1].data;
-  T* out = (T*)op.out[0].data;
+  const T* in0 = reinterpret_cast<const T*>(op.in[0].data);
+  const T* in1 = reinterpret_cast<const T*>(op.in[1].data);
+  T* out = reinterpret_cast<T*>(op.out[0].data);
   for (int i = 0; i < n; ++i) {
     out[i] = i + shfamt < n ? in0[i + shfamt] : in1[i + shfamt - n];
   }
@@ -704,9 +702,9 @@
 void VSlidevp(valu_t& op) {
   constexpr int n = kLanes * 4 / sizeof(T);
   const int shfamt = (op.f2 & 3) + 1;
-  const T* in0 = (const T*)op.in[0].data;
-  const T* in1 = (const T*)op.in[1].data;
-  T* out = (T*)op.out[0].data;
+  const T* in0 = reinterpret_cast<const T*>(op.in[0].data);
+  const T* in1 = reinterpret_cast<const T*>(op.in[1].data);
+  T* out = reinterpret_cast<T*>(op.out[0].data);
   for (int i = 0; i < n; ++i) {
     out[i] = i - shfamt < 0 ? in0[n - shfamt + i] : in1[i - shfamt];
   }
@@ -717,11 +715,11 @@
 void VSlidehn2(valu_t& op) {
   constexpr int n = kLanes * 4 / sizeof(T);
   const int shfamt = (op.f2 & 3) + 1;
-  const T* in0 = (const T*)op.in[0].data;
-  const T* in1 = (const T*)op.in[1].data;
-  const T* in2 = (const T*)op.in[2].data;
-  T* out0 = (T*)op.out[0].data;
-  T* out1 = (T*)op.out[1].data;
+  const T* in0 = reinterpret_cast<const T*>(op.in[0].data);
+  const T* in1 = reinterpret_cast<const T*>(op.in[1].data);
+  const T* in2 = reinterpret_cast<const T*>(op.in[2].data);
+  T* out0 = reinterpret_cast<T*>(op.out[0].data);
+  T* out1 = reinterpret_cast<T*>(op.out[1].data);
   for (int i = 0; i < n; ++i) {
     out0[i] = i + shfamt < n ? in0[i + shfamt] : in1[i + shfamt - n];
   }
@@ -736,11 +734,11 @@
 void VSlidehp2(valu_t& op) {
   constexpr int n = kLanes * 4 / sizeof(T);
   const int shfamt = (op.f2 & 3) + 1;
-  const T* in0 = (const T*)op.in[0].data;
-  const T* in1 = (const T*)op.in[1].data;
-  const T* in2 = (const T*)op.in[2].data;
-  T* out0 = (T*)op.out[0].data;
-  T* out1 = (T*)op.out[1].data;
+  const T* in0 = reinterpret_cast<const T*>(op.in[0].data);
+  const T* in1 = reinterpret_cast<const T*>(op.in[1].data);
+  const T* in2 = reinterpret_cast<const T*>(op.in[2].data);
+  T* out0 = reinterpret_cast<T*>(op.out[0].data);
+  T* out1 = reinterpret_cast<T*>(op.out[1].data);
   for (int i = 0; i < n; ++i) {
     out0[i] = i - shfamt < 0 ? in0[n - shfamt + i] : in1[i - shfamt];
   }
@@ -798,11 +796,10 @@
 template <typename T>
 void VSel(valu_t& op) {
   constexpr int n = kLanes * 4 / sizeof(T);
-  const int shfamt = (op.f2 & 3) + 1;
-  const T* in0 = (const T*)op.in[0].data;
-  const T* in1 = (const T*)op.in[1].data;
-  const T* in2 = (const T*)op.in[2].data;
-  T* out = (T*)op.out[0].data;
+  const T* in0 = reinterpret_cast<const T*>(op.in[0].data);
+  const T* in1 = reinterpret_cast<const T*>(op.in[1].data);
+  const T* in2 = reinterpret_cast<const T*>(op.in[2].data);
+  T* out = reinterpret_cast<T*>(op.out[0].data);
   for (int i = 0; i < n; ++i) {
     out[i] = in0[i] & 1 ? in2[i] : in1[i];
   }
@@ -823,10 +820,9 @@
 template <typename T>
 void VEvn(valu_t& op) {
   constexpr int n = kLanes * 4 / sizeof(T);
-  constexpr int h = n / 2;
-  const T* in0 = (const T*)op.in[0].data;
-  const T* in1 = (const T*)op.in[1].data;
-  T* out0 = (T*)op.out[0].data;
+  const T* in0 = reinterpret_cast<const T*>(op.in[0].data);
+  const T* in1 = reinterpret_cast<const T*>(op.in[1].data);
+  T* out0 = reinterpret_cast<T*>(op.out[0].data);
   for (int i = 0; i < n; ++i) {
     out0[i] = i < n / 2 ? in0[2 * i + 0] : in1[2 * (i - n / 2) + 0];
   }
@@ -836,10 +832,9 @@
 template <typename T>
 void VOdd(valu_t& op) {
   constexpr int n = kLanes * 4 / sizeof(T);
-  constexpr int h = n / 2;
-  const T* in0 = (const T*)op.in[0].data;
-  const T* in1 = (const T*)op.in[1].data;
-  T* out1 = (T*)op.out[1].data;
+  const T* in0 = reinterpret_cast<const T*>(op.in[0].data);
+  const T* in1 = reinterpret_cast<const T*>(op.in[1].data);
+  T* out1 = reinterpret_cast<T*>(op.out[1].data);
   for (int i = 0; i < n; ++i) {
     out1[i] = i < n / 2 ? in0[2 * i + 1] : in1[2 * (i - n / 2) + 1];
   }
@@ -889,10 +884,10 @@
 void VZip(valu_t& op) {
   constexpr int n = kLanes * 4 / sizeof(T);
   constexpr int h = n / 2;
-  const T* in0 = (const T*)op.in[0].data;
-  const T* in1 = (const T*)op.in[1].data;
-  T* out0 = (T*)op.out[0].data;
-  T* out1 = (T*)op.out[1].data;
+  const T* in0 = reinterpret_cast<const T*>(op.in[0].data);
+  const T* in1 = reinterpret_cast<const T*>(op.in[1].data);
+  T* out0 = reinterpret_cast<T*>(op.out[0].data);
+  T* out1 = reinterpret_cast<T*>(op.out[1].data);
   for (int i = 0; i < n; ++i) {
     const int j = i / 2;
     out0[i] = i & 1 ? in1[j + 0] : in0[j + 0];
@@ -937,29 +932,29 @@
 }
 
 static void VDwconv(valu_t& op) {
-  const uint32_t* in0 = (const uint32_t*)op.in[0].data;
-  const uint32_t* in1 = (const uint32_t*)op.in[1].data;
-  const uint32_t* in2 = (const uint32_t*)op.in[2].data;
-  const uint32_t* in3 = (const uint32_t*)op.in[3].data;
-  const uint32_t* in4 = (const uint32_t*)op.in[4].data;
-  const uint32_t* in5 = (const uint32_t*)op.in[5].data;
-  uint32_t* out0 = (uint32_t*)op.out[0].data;
-  uint32_t* out1 = (uint32_t*)op.out[1].data;
-  uint32_t* out2 = (uint32_t*)op.out[2].data;
-  uint32_t* out3 = (uint32_t*)op.out[3].data;
+  const uint32_t* in0 = reinterpret_cast<const uint32_t*>(op.in[0].data);
+  const uint32_t* in1 = reinterpret_cast<const uint32_t*>(op.in[1].data);
+  const uint32_t* in2 = reinterpret_cast<const uint32_t*>(op.in[2].data);
+  const uint32_t* in3 = reinterpret_cast<const uint32_t*>(op.in[3].data);
+  const uint32_t* in4 = reinterpret_cast<const uint32_t*>(op.in[4].data);
+  const uint32_t* in5 = reinterpret_cast<const uint32_t*>(op.in[5].data);
+  uint32_t* out0 = reinterpret_cast<uint32_t*>(op.out[0].data);
+  uint32_t* out1 = reinterpret_cast<uint32_t*>(op.out[1].data);
+  uint32_t* out2 = reinterpret_cast<uint32_t*>(op.out[2].data);
+  uint32_t* out3 = reinterpret_cast<uint32_t*>(op.out[3].data);
 
   struct vdwconv_u8_t {
-    uint32_t mode : 2;      // 1:0
-    uint32_t sparsity : 2;  // 3:2
-    uint32_t regbase : 4;   // 7:4
-    uint32_t rsvd : 4;      // 11:8
-    uint32_t abias : 9;     // 20:12
-    uint32_t asign : 1;     // 21
-    uint32_t bbias : 9;     // 30:22
-    uint32_t bsign : 1;     // 31
+    uint32_t mode : 2;      // 1:0  NOTE(review): assumes LSB-first bit-fields.
+    uint32_t sparsity : 2;  // 3:2
+    uint32_t regbase : 4;   // 7:4
+    uint32_t rsvd : 4;      // 11:8
+    uint32_t abias : 9;     // 20:12
+    uint32_t asign : 1;     // 21
+    uint32_t bbias : 9;     // 30:22
+    uint32_t bsign : 1;     // 31  TODO confirm against the command encoding.
   } cmd;
 
-  uint32_t* p_cmd = (uint32_t*)&cmd;
+  uint32_t* p_cmd = reinterpret_cast<uint32_t*>(&cmd);
   *p_cmd = op.sv.data;
   assert(cmd.mode == 0);
   assert(cmd.rsvd == 0);
@@ -970,7 +965,8 @@
   const bool bsign = cmd.bsign;
 
   constexpr int n = kVector / 32;
-  uint32_t sparse[n + 2];
+  constexpr int kSparseSize = n + 2;
+  uint32_t sparse[kSparseSize];
   if (cmd.sparsity == 1) {
     sparse[0] = in0[n - 1];
     for (int i = 0; i < kVector / 32; ++i) {
@@ -1009,18 +1005,18 @@
 static void VAlu(valu_t& op) {
   // clang-format off
   switch (op.op) {
-    case vslidevn: VSlidevn(op); return;
-    case vslidevp: VSlidevp(op); return;
-    case vslidehn: VSlidevn(op); return;
-    case vslidehp: VSlidevp(op); return;
-    case vslidehn2: VSlidehn2(op); return;
-    case vslidehp2: VSlidehp2(op); return;
-    case vsel: VSel(op); return;
-    case vevn: VEvn(op); return;
-    case vodd: VOdd(op); return;
-    case vevnodd: VEvnOdd(op); return;
-    case vzip: VZip(op); return;
-    case vdwconv: VDwconv(op); return;
+    case encode::vslidevn: VSlidevn(op); return;
+    case encode::vslidevp: VSlidevp(op); return;
+    case encode::vslidehn: VSlidevn(op); return;
+    case encode::vslidehp: VSlidevp(op); return;
+    case encode::vslidehn2: VSlidehn2(op); return;
+    case encode::vslidehp2: VSlidehp2(op); return;
+    case encode::vsel: VSel(op); return;
+    case encode::vevn: VEvn(op); return;
+    case encode::vodd: VOdd(op); return;
+    case encode::vevnodd: VEvnOdd(op); return;
+    case encode::vzip: VZip(op); return;
+    case encode::vdwconv: VDwconv(op); return;
   }
   // clang-format on
 
@@ -1031,72 +1027,70 @@
     const uint32_t b = op.in[1].data[i];
     const uint32_t c = op.in[2].data[i];
     const uint32_t d = op.in[3].data[i];
-    const uint32_t e = op.in[4].data[i];
     const uint32_t f = op.in[5].data[i];
-    const uint32_t g = op.in[6].data[i];
     bool v = false;
     bool w = false;
     uint32_t x = 0;
     uint32_t y = 0;
 
-    const bool f2_negative =
-        ((f2 >> 0) & 1) && (op.op == vdmulh || op.op == vdmulh2);
+    const bool f2_negative = ((f2 >> 0) & 1) && (op.op == encode::vdmulh ||
+                                                 op.op == encode::vdmulh2);
     const bool f2_round = (f2 >> 1) & 1;
     const bool f2_signed =
-        !((f2 >> 0) & 1) || op.op == vdmulh || op.op == vdmulh2;
+        !((f2 >> 0) & 1) || op.op == encode::vdmulh || op.op == encode::vdmulh2;
 
     // clang-format off
     switch (op.op) {
-      case vdup:    VOPXU(dup); break;
-      case vadd:    VOP2U(add); break;
-      case vsub:    VOP2U(sub); break;
-      case vrsub:   VOP2U(rsub); break;
-      case veq:     VOP2U(cmp_eq); break;
-      case vne:     VOP2U(cmp_ne); break;
-      case vlt:     VOP2(cmp_lt); break;
-      case vle:     VOP2(cmp_le); break;
-      case vgt:     VOP2(cmp_gt); break;
-      case vge:     VOP2(cmp_ge); break;
-      case vabsd:   VOP2(absd); break;
-      case vmax:    VOP2(max); break;
-      case vmin:    VOP2(min); break;
-      case vadd3:   VOP3U(add3); break;
-      case vand:    VOP2U(log_and); break;
-      case vor:     VOP2U(log_or); break;
-      case vxor:    VOP2U(log_xor); break;
-      case vnot:    VOP1U(log_not); break;
-      case vrev:    VOP2U(log_rev); break;
-      case vror:    VOP2U(log_ror); break;
-      case vclb:    VOP1U(log_clb); break;
-      case vclz:    VOP1U(log_clz); break;
-      case vcpop:   VOP1U(log_cpop); break;
-      case vmv:     VOP1U(mv); break;
-      case vmv2:    VOP1PU(mv); break;
-      case vmvp:    VOP2M(mvp); break;
-      case vshl:    VOP2U(shl); break;
-      case vshr:    VOP2(shr); break;
-      case vshf:    VOP2_R(shf, f2_round); break;
-      case vsrans:  VOP3NS_R_U(srans, f2_round, !f2_signed); break;
-      case vsraqs:  VOP3QS_R_U(srans, f2_round, !f2_signed); break;
-      case vmul:    VOP2S(mul); break;
-      case vmul2:   VOP2PS(mul); break;
-      case vmuls:   VOP2(muls); break;
-      case vmuls2:  VOP2P(muls); break;
-      case vmulw:   WOP2(mulw); break;
-      case vmulh:   VOP2_R(mulh, f2_round); break;
-      case vmulh2:  VOP2P_R(mulh, f2_round); break;
-      case vdmulh:  VOP2_R_X(dmulh, f2_round, f2_negative); break;
-      case vdmulh2: VOP2P_R_X(dmulh, f2_round, f2_negative); break;
-      case vmadd:   VOP3(madd); break;
-      case vadds:   VOP2(adds); break;
-      case vsubs:   VOP2(subs); break;
-      case vaddw:   WOP2(addw); break;
-      case vsubw:   WOP2(subw); break;
-      case vacc:    WOPA(acc); break;
-      case vpadd:   VOPP(padd); break;
-      case vpsub:   VOPP(psub); break;
-      case vhadd:   VOP2_R(hadd, f2_round); break;
-      case vhsub:   VOP2_R(hsub, f2_round); break;
+      case encode::vdup:    VOPXU(dup); break;
+      case encode::vadd:    VOP2U(add); break;
+      case encode::vsub:    VOP2U(sub); break;
+      case encode::vrsub:   VOP2U(rsub); break;
+      case encode::veq:     VOP2U(cmp_eq); break;
+      case encode::vne:     VOP2U(cmp_ne); break;
+      case encode::vlt:     VOP2(cmp_lt); break;
+      case encode::vle:     VOP2(cmp_le); break;
+      case encode::vgt:     VOP2(cmp_gt); break;
+      case encode::vge:     VOP2(cmp_ge); break;
+      case encode::vabsd:   VOP2(absd); break;
+      case encode::vmax:    VOP2(max); break;
+      case encode::vmin:    VOP2(min); break;
+      case encode::vadd3:   VOP3U(add3); break;
+      case encode::vand:    VOP2U(log_and); break;
+      case encode::vor:     VOP2U(log_or); break;
+      case encode::vxor:    VOP2U(log_xor); break;
+      case encode::vnot:    VOP1U(log_not); break;
+      case encode::vrev:    VOP2U(log_rev); break;
+      case encode::vror:    VOP2U(log_ror); break;
+      case encode::vclb:    VOP1U(log_clb); break;
+      case encode::vclz:    VOP1U(log_clz); break;
+      case encode::vcpop:   VOP1U(log_cpop); break;
+      case encode::vmv:     VOP1U(mv); break;
+      case encode::vmv2:    VOP1PU(mv); break;
+      case encode::vmvp:    VOP2M(mvp); break;
+      case encode::vshl:    VOP2U(shl); break;
+      case encode::vshr:    VOP2(shr); break;
+      case encode::vshf:    VOP2_R(shf, f2_round); break;
+      case encode::vsrans:  VOP3NS_R_U(srans, f2_round, !f2_signed); break;
+      case encode::vsraqs:  VOP3QS_R_U(srans, f2_round, !f2_signed); break;
+      case encode::vmul:    VOP2S(mul); break;
+      case encode::vmul2:   VOP2PS(mul); break;
+      case encode::vmuls:   VOP2(muls); break;
+      case encode::vmuls2:  VOP2P(muls); break;
+      case encode::vmulw:   WOP2(mulw); break;
+      case encode::vmulh:   VOP2_R(mulh, f2_round); break;
+      case encode::vmulh2:  VOP2P_R(mulh, f2_round); break;
+      case encode::vdmulh:  VOP2_R_X(dmulh, f2_round, f2_negative); break;
+      case encode::vdmulh2: VOP2P_R_X(dmulh, f2_round, f2_negative); break;
+      case encode::vmadd:   VOP3(madd); break;
+      case encode::vadds:   VOP2(adds); break;
+      case encode::vsubs:   VOP2(subs); break;
+      case encode::vaddw:   WOP2(addw); break;
+      case encode::vsubw:   WOP2(subw); break;
+      case encode::vacc:    WOPA(acc); break;
+      case encode::vpadd:   VOPP(padd); break;
+      case encode::vpsub:   VOPP(psub); break;
+      case encode::vhadd:   VOP2_R(hadd, f2_round); break;
+      case encode::vhsub:   VOP2_R(hsub, f2_round); break;
     }
     // clang-format on
 
diff --git a/tests/verilator_sim/kelvin/valu_tb.cc b/tests/verilator_sim/kelvin/valu_tb.cc
index f3ff74b..5d05ade 100644
--- a/tests/verilator_sim/kelvin/valu_tb.cc
+++ b/tests/verilator_sim/kelvin/valu_tb.cc
@@ -1,8 +1,8 @@
 // Copyright 2023 Google LLC
 
 #include "VVAlu.h"
-#include "sysc_tb.h"
-#include "valu.h"
+#include "tests/verilator_sim/kelvin/valu.h"
+#include "tests/verilator_sim/sysc_tb.h"
 
 struct VAlu_tb : Sysc_tb {
   sc_in<bool> io_in_ready;
@@ -431,7 +431,7 @@
     bool valid = rand_int(0, 3);
     inputs_t in;
 
-    in.op = rand_int(0, kOpEntries - 1);
+    in.op = rand_int(0, encode::kOpEntries - 1);
     in.f2 = rand_int(0, 7);
     in.sz = 1u << rand_int(0, 2);
     in.m = rand_int(0, 7) == 0;
@@ -440,14 +440,15 @@
     in.sv.addr = rand_uint32();
     in.sv.data = rand_uint32();
 
-    if (in.op == vevn || in.op == vevnodd || in.op == vodd) {
+    if (in.op == encode::vevn || in.op == encode::vevnodd ||
+        in.op == encode::vodd) {
       // Disallow even/odd in CRT.
-      in.op = vadd;
+      in.op = encode::vadd;
     }
 
-    if (in.op == vdwconv) {
+    if (in.op == encode::vdwconv) {
       // Disallow DW in CRT.
-      in.op = vadd;
+      in.op = encode::vadd;
     }
 
     // Assign random values to inactive read addr/tag.
@@ -464,69 +465,69 @@
     }
 
     switch (in.op) {
-      case vabsd:
-      case vadd:
-      case vadds:
-      case vhadd:
-      case vhsub:
-      case vmax:
-      case vmin:
-      case vrsub:
-      case vsub:
-      case vsubs:
-      case veq:
-      case vne:
-      case vlt:
-      case vle:
-      case vgt:
-      case vge:
-      case vand:
-      case vclb:
-      case vclz:
-      case vcpop:
-      case vevn:
-      case vor:
-      case vrev:
-      case vror:
-      case vxor:
-      case vdmulh:
-      case vmul:
-      case vmulh:
-      case vmuls:
-      case vshl:
-      case vshr:
-      case vshf:
+      case encode::vabsd:
+      case encode::vadd:
+      case encode::vadds:
+      case encode::vhadd:
+      case encode::vhsub:
+      case encode::vmax:
+      case encode::vmin:
+      case encode::vrsub:
+      case encode::vsub:
+      case encode::vsubs:
+      case encode::veq:
+      case encode::vne:
+      case encode::vlt:
+      case encode::vle:
+      case encode::vgt:
+      case encode::vge:
+      case encode::vand:
+      case encode::vclb:
+      case encode::vclz:
+      case encode::vcpop:
+      case encode::vevn:
+      case encode::vor:
+      case encode::vrev:
+      case encode::vror:
+      case encode::vxor:
+      case encode::vdmulh:
+      case encode::vmul:
+      case encode::vmulh:
+      case encode::vmuls:
+      case encode::vshl:
+      case encode::vshr:
+      case encode::vshf:
         in.r[0].valid = true;
         in.r[1].valid = true;
         in.w[0].valid = true;
         break;
-      case vaddw:
-      case vevnodd:
-      case vsubw:
-      case vmulw:
-      case vmvp:
-      case vzip:
+      case encode::vaddw:
+      case encode::vevnodd:
+      case encode::vsubw:
+      case encode::vmulw:
+      case encode::vmvp:
+      case encode::vzip:
         in.r[0].valid = true;
         in.r[1].valid = true;
         in.w[0].valid = true;
         in.w[1].valid = true;
         break;
-      case vacc:
+      case encode::vacc:
         in.r[0].valid = true;
         in.r[1].valid = true;
         in.r[2].valid = true;
         in.w[0].valid = true;
         in.w[1].valid = true;
         break;
-      case vadd3:
-      case vmadd:
-      case vsrans:
+      case encode::vadd3:
+      case encode::vmadd:
+      case encode::vsrans:
         in.r[0].valid = true;
         in.r[1].valid = true;
         in.r[2].valid = true;
         in.w[0].valid = true;
         break;
-      case vsraqs:
+      case encode::vsraqs:
         in.r[0].valid = true;
         in.r[1].valid = true;
         in.r[2].valid = true;
@@ -535,22 +536,22 @@
         in.w[0].valid = true;
         in.cmdsync = true;
         break;
-      case vdup:
+      case encode::vdup:
         in.r[1].valid = true;
         in.w[0].valid = true;
         break;
-      case vmv:
-      case vpadd:
-      case vpsub:
+      case encode::vmv:
+      case encode::vpadd:
+      case encode::vpsub:
         in.r[0].valid = true;
         in.w[0].valid = true;
         break;
-      case vodd:
+      case encode::vodd:
         in.r[0].valid = true;
         in.r[1].valid = true;
         in.w[1].valid = true;
         break;
-      case vdwconv:
+      case encode::vdwconv:
         in.r[0].valid = true;
         in.r[1].valid = true;
         in.r[2].valid = true;
@@ -580,7 +581,7 @@
     }
 
     // Assign inactive write addresses.
-    if (in.op == vzip) {
+    if (in.op == encode::vzip) {
       int addr = 0;
       valid = valid && FindInactiveWriteAddr2(in.m, wactive, addr);
       in.w[0].valid = valid;
@@ -637,7 +638,7 @@
         for (int i = 0; i < kWritePorts; ++i) {
           if (alu.w[i].valid) {
             wactive |= 1ull << waddr[i];
-            if (in.op == vzip) {
+            if (in.op == encode::vzip) {
               waddr[i] += 2;
             } else {
               waddr[i]++;  // stripmine update
@@ -697,10 +698,10 @@
   void ProcessInputs(const int idx) {
     // clang-format off
     if (!(io_in_valid && io_in_ready) ||
-        idx == 0 && !io_in_bits_0_valid ||
-        idx == 1 && !io_in_bits_1_valid ||
-        idx == 2 && !io_in_bits_2_valid ||
-        idx == 3 && !io_in_bits_3_valid) {
+        (idx == 0 && !io_in_bits_0_valid) ||
+        (idx == 1 && !io_in_bits_1_valid) ||
+        (idx == 2 && !io_in_bits_2_valid) ||
+        (idx == 3 && !io_in_bits_3_valid)) {
       cmdq_[idx].clear();
       return;
     }
@@ -741,10 +742,10 @@
 
   void ProcessOutputs(const int idx) {
     // clang-format off
-    if (idx == 0 && !io_write_0_valid ||
-        idx == 1 && !io_write_1_valid ||
-        idx == 2 && !io_write_2_valid ||
-        idx == 3 && !io_write_3_valid) {
+    if ((idx == 0 && !io_write_0_valid) ||
+        (idx == 1 && !io_write_1_valid) ||
+        (idx == 2 && !io_write_2_valid) ||
+        (idx == 3 && !io_write_3_valid)) {
       return;
     }
     // clang-format on
@@ -771,8 +772,8 @@
 
     if (memcmp(dut, ref, kLanes * 4)) {
       char s[100];
-      sprintf(s, "valu op=%d f2=%d sz=%d", write_[addr].op, write_[addr].f2,
-              write_[addr].sz);
+      snprintf(s, sizeof(s), "valu op=%d f2=%d sz=%d", write_[addr].op,
+               write_[addr].f2, write_[addr].sz);
       printf("ref[%2d]  ", addr);
       for (int i = 0; i < kLanes; ++i) {
         printf(" %08x", ref[i]);
diff --git a/tests/verilator_sim/kelvin/valuint_tb.cc b/tests/verilator_sim/kelvin/valuint_tb.cc
index f86ad37..05999be 100644
--- a/tests/verilator_sim/kelvin/valuint_tb.cc
+++ b/tests/verilator_sim/kelvin/valuint_tb.cc
@@ -1,12 +1,12 @@
 // Copyright 2023 Google LLC
 
-#include "VVAluInt.h"
-#include "sysc_tb.h"
-#include "valu.h"
+#include "VVAluInt.h"  // Generated.
+#include "tests/verilator_sim/kelvin/valu.h"
+#include "tests/verilator_sim/sysc_tb.h"
 
 struct VAluInt_tb : Sysc_tb {
   sc_out<bool> io_in_valid;
-  sc_out<sc_bv<kOpBits> > io_in_op;
+  sc_out<sc_bv<encode::kOpBits> > io_in_op;
   sc_out<sc_bv<3> > io_in_f2;
   sc_out<sc_bv<3> > io_in_sz;
   sc_out<sc_bv<6> > io_in_vd_addr;
@@ -41,15 +41,15 @@
     const uint8_t ve_addr = rand_int(0, 63);
     uint32_t sv_data = 0;
 
-    uint8_t op = rand_int(0, kOpEntries - 1);
+    uint8_t op = rand_int(0, encode::kOpEntries - 1);
 
     // Inputs.
     valu_t r = {0};
     r_.read(r);
 
-    if (op == vdwconv) {
+    if (op == encode::vdwconv) {
       // Disallow DW in CRT.
-      op = 0;  // TODO
+      op = 0;
     }
 
     io_in_valid = valid;
@@ -142,7 +142,7 @@
 
 static void VAluInt_test(char* name, int loops, bool trace) {
   sc_signal<bool> io_in_valid;
-  sc_signal<sc_bv<kOpBits> > io_in_op;
+  sc_signal<sc_bv<encode::kOpBits> > io_in_op;
   sc_signal<sc_bv<3> > io_in_f2;
   sc_signal<sc_bv<3> > io_in_sz;
   sc_signal<sc_bv<6> > io_in_vd_addr;