Handle mask loads and stores (unit-stride vlm.v/vsm.v).

Change-Id: I107037ba52bf39cca0e2883bc46401422deb04ae
diff --git a/hdl/chisel/src/kelvin/scalar/Decode.scala b/hdl/chisel/src/kelvin/scalar/Decode.scala
index 02402c8..6144608 100644
--- a/hdl/chisel/src/kelvin/scalar/Decode.scala
+++ b/hdl/chisel/src/kelvin/scalar/Decode.scala
@@ -706,6 +706,7 @@
     if (p.enableRvv) {
       io.lsu(i).bits.elemWidth.get := io.inst(i).bits.inst(14,12)
       io.lsu(i).bits.nfields.get := io.inst(i).bits.inst(31,29)
+      io.lsu(i).bits.umop.get := io.inst(i).bits.inst(24,20)
     }
 
     // -------------------------------------------------------------------------
@@ -1201,6 +1202,7 @@
   if (p.enableRvv) {
     io.lsu.bits.elemWidth.get := io.inst.bits.inst(14,12)
     io.lsu.bits.nfields.get := io.inst.bits.inst(31,29)
+    io.lsu.bits.umop.get := io.inst.bits.inst(24,20)
   }
 
   // MLU opcode.
diff --git a/hdl/chisel/src/kelvin/scalar/Lsu.scala b/hdl/chisel/src/kelvin/scalar/Lsu.scala
index 693815b..0e603fd 100644
--- a/hdl/chisel/src/kelvin/scalar/Lsu.scala
+++ b/hdl/chisel/src/kelvin/scalar/Lsu.scala
@@ -142,6 +142,16 @@
   val pc = UInt(32.W)
   val elemWidth = Option.when(p.enableRvv) { UInt(3.W) }
   val nfields = Option.when(p.enableRvv) { UInt(3.W) }
+  val umop = Option.when(p.enableRvv) { UInt(5.W) }
+
+  // True only for unit-stride vector loads/stores whose umop field is 0b01011.
+  def isMaskOperation(): Bool = umop match {
+    // umop is only present when RVV is enabled (Option.when(p.enableRvv)).
+    case Some(field) =>
+      (field === "b01011".U) &&
+        op.isOneOf(LsuOp.VLOAD_UNIT, LsuOp.VSTORE_UNIT)
+    case None => false.B
+  }
 
   override def toPrintable: Printable = {
     cf"LsuCmd(store -> ${store}, addr -> 0x${addr}%x, op -> ${op}, " +
@@ -188,11 +198,15 @@
     }
     if (p.enableRvv) {
       result.elemWidth.get := cmd.elemWidth.get
-      result.nfields.get := cmd.nfields.get
-      // Treat fractional LMULs as LMUL=1
-      val effectiveLmul = Mux(rvvState.get.bits.lmul(2),
-                              0.U(2.W), rvvState.get.bits.lmul(1, 0))
+      val effectiveLmul = MuxCase(rvvState.get.bits.lmul(1, 0), Seq(
+        // Treat fractional LMULs as LMUL=1
+        (rvvState.get.bits.lmul(2)) -> 0.U(2.W),
+        // If mask operation, always force LMUL=1.
+        (cmd.isMaskOperation()) -> 0.U(2.W),
+      ))
       result.lmul.get := 1.U(1.W) << effectiveLmul
+      // If mask operation, force fields to zero
+      result.nfields.get := Mux(cmd.isMaskOperation(), 0.U, cmd.nfields.get)
     }
 
     result