Fix segmented load-store register layout

The order of operations, as observed from memory, is unchanged.
Only the order in which vector registers are accessed changes.

The fix is now complete, and the previously affected tests are re-enabled.

Change-Id: Ib7bf17b0d60c18a8b0b4cb8163277cd0afe8e37c
diff --git a/hdl/chisel/src/kelvin/scalar/Lsu.scala b/hdl/chisel/src/kelvin/scalar/Lsu.scala
index 0e603fd..832db87 100644
--- a/hdl/chisel/src/kelvin/scalar/Lsu.scala
+++ b/hdl/chisel/src/kelvin/scalar/Lsu.scala
@@ -276,6 +276,10 @@
   val elemWidth = UInt(3.W)
   val nfields = UInt(3.W)
   val segment = UInt(3.W)
+  // Add this to find the next segment.
+  val nextSegmentVectorOffset = UInt(3.W)
+  // Subtract this to find the first segment in the next lmul.
+  val nextLmulVectorRewind = UInt(3.W)
 
   // If the slot has no pending tasks and can accept a new operation
   def slotIdle(): Bool = {
@@ -322,6 +326,8 @@
     result.baseAddr := baseAddr
     result.elemStride := elemStride
     result.segmentStride := segmentStride
+    result.nextSegmentVectorOffset := nextSegmentVectorOffset
+    result.nextLmulVectorRewind := nextLmulVectorRewind
 
 
     val segmentBaseAddr = baseAddr + (segmentStride * segment)
@@ -382,6 +388,8 @@
     result.elemWidth := elemWidth
     result.nfields := nfields
     result.segment := segment
+    result.nextSegmentVectorOffset := nextSegmentVectorOffset
+    result.nextLmulVectorRewind := nextLmulVectorRewind
 
     result
   }
@@ -405,6 +413,8 @@
     result.segmentStride := segmentStride
     result.elemWidth := elemWidth
     result.nfields := nfields
+    result.nextSegmentVectorOffset := nextSegmentVectorOffset
+    result.nextLmulVectorRewind := nextLmulVectorRewind
 
     val vectorWriteback = writeback && LsuOp.isVector(op)
 
@@ -447,7 +457,12 @@
           (baseAddr + (nfields * 16.U) + 16.U),
       // Indexed don't have base addr changed.
     ))
-    result.rd := rd + 1.U  // Move to next vector register
+    result.rd := MuxCase(rd, Seq(
+      // Finished one lmul, start from segment 0 again
+      lmulUpdate -> (rd - nextLmulVectorRewind + 1.U),
+      // Finished one segment, proceed to next
+      vectorWriteback -> (rd + nextSegmentVectorOffset),
+    ))
 
     result
   }
@@ -479,6 +494,8 @@
     result.elemWidth := elemWidth
     result.nfields := nfields
     result.segment := segment
+    result.nextSegmentVectorOffset := nextSegmentVectorOffset
+    result.nextLmulVectorRewind := nextLmulVectorRewind
     result
   }
 
@@ -524,11 +541,17 @@
     result.store := uop.store
     result.pc := uop.pc
     if (p.enableRvv) {
-      result.lmul := uop.lmul.getOrElse(0.U)
-      result.nfields := Mux(LsuOp.isVector(uop.op), uop.nfields.get, 0.U)
+      val lmul = uop.lmul.getOrElse(0.U)
+      result.lmul := lmul
+      val nfields = Mux(LsuOp.isVector(uop.op), uop.nfields.get, 0.U)
+      result.nfields := nfields
       result.segment := 0.U
+      result.nextSegmentVectorOffset := lmul
+      result.nextLmulVectorRewind := nfields * lmul
     } else {
       result.lmul := 0.U
+      result.nextSegmentVectorOffset := 0.U
+      result.nextLmulVectorRewind := 0.U
     }
 
     // All vector ops require writeback. Lsu needs to inform RVV core store uop
diff --git a/tests/cocotb/BUILD b/tests/cocotb/BUILD
index 43b8c76..7efc947 100644
--- a/tests/cocotb/BUILD
+++ b/tests/cocotb/BUILD
@@ -187,10 +187,9 @@
     "load8_segment2_unit_m1",
     "load16_segment2_unit_m1",
     "load32_segment2_unit_m1",
-    # TODO(davidgao): re-enable once fixes are in.
-    # "load8_segment2_unit_m2",
-    # "load16_segment2_unit_m2",
-    # "load32_segment2_unit_m2",
+    "load8_segment2_unit_m2",
+    "load16_segment2_unit_m2",
+    "load32_segment2_unit_m2",
     "load8_segment2_stride6_m1",
     "load16_segment2_stride6_m1",
     "load8_indexed_m1",
diff --git a/tests/cocotb/rvv_load_store_test.py b/tests/cocotb/rvv_load_store_test.py
index 4395db2..f7d0b40 100644
--- a/tests/cocotb/rvv_load_store_test.py
+++ b/tests/cocotb/rvv_load_store_test.py
@@ -414,8 +414,7 @@
     )
 
 
-# TODO(davidgao): re-enable once fixes are in.
-@cocotb.test(skip=True)
+@cocotb.test()
 async def load8_segment2_unit_m2(dut):
     await vector_load_store(
         dut=dut,
@@ -427,8 +426,7 @@
     )
 
 
-# TODO(davidgao): re-enable once fixes are in.
-@cocotb.test(skip=True)
+@cocotb.test()
 async def load16_segment2_unit_m2(dut):
     await vector_load_store(
         dut=dut,
@@ -440,8 +438,7 @@
     )
 
 
-# TODO(davidgao): re-enable once fixes are in.
-@cocotb.test(skip=True)
+@cocotb.test()
 async def load32_segment2_unit_m2(dut):
     await vector_load_store(
         dut=dut,
@@ -517,10 +514,10 @@
             make_test_case('vsseg2e8_v_u8mf2x2', vl=7, n_segs=2),
             make_test_case('vsseg2e8_v_u8m1x2', vl=16, n_segs=2),
             make_test_case('vsseg2e8_v_u8m1x2', vl=15, n_segs=2),
-            # make_test_case('vsseg2e8_v_u8m2x2', vl=32, n_segs=2),
-            # make_test_case('vsseg2e8_v_u8m2x2', vl=31, n_segs=2),
-            # make_test_case('vsseg2e8_v_u8m4x2', vl=64, n_segs=2),
-            # make_test_case('vsseg2e8_v_u8m4x2', vl=63, n_segs=2),
+            make_test_case('vsseg2e8_v_u8m2x2', vl=32, n_segs=2),
+            make_test_case('vsseg2e8_v_u8m2x2', vl=31, n_segs=2),
+            make_test_case('vsseg2e8_v_u8m4x2', vl=64, n_segs=2),
+            make_test_case('vsseg2e8_v_u8m4x2', vl=63, n_segs=2),
             # Seg 3
             make_test_case('vsseg3e8_v_u8mf4x3', vl=4, n_segs=3),
             make_test_case('vsseg3e8_v_u8mf4x3', vl=3, n_segs=3),
@@ -528,8 +525,8 @@
             make_test_case('vsseg3e8_v_u8mf2x3', vl=7, n_segs=3),
             make_test_case('vsseg3e8_v_u8m1x3', vl=16, n_segs=3),
             make_test_case('vsseg3e8_v_u8m1x3', vl=15, n_segs=3),
-            # make_test_case('vsseg3e8_v_u8m2x3', vl=32, n_segs=3),
-            # make_test_case('vsseg3e8_v_u8m2x3', vl=31, n_segs=3),
+            make_test_case('vsseg3e8_v_u8m2x3', vl=32, n_segs=3),
+            make_test_case('vsseg3e8_v_u8m2x3', vl=31, n_segs=3),
             # Seg 4
             make_test_case('vsseg4e8_v_u8mf4x4', vl=4, n_segs=4),
             make_test_case('vsseg4e8_v_u8mf4x4', vl=3, n_segs=4),
@@ -537,8 +534,8 @@
             make_test_case('vsseg4e8_v_u8mf2x4', vl=7, n_segs=4),
             make_test_case('vsseg4e8_v_u8m1x4', vl=16, n_segs=4),
             make_test_case('vsseg4e8_v_u8m1x4', vl=15, n_segs=4),
-            # make_test_case('vsseg4e8_v_u8m2x4', vl=32, n_segs=4),
-            # make_test_case('vsseg4e8_v_u8m2x4', vl=31, n_segs=4),
+            make_test_case('vsseg4e8_v_u8m2x4', vl=32, n_segs=4),
+            make_test_case('vsseg4e8_v_u8m2x4', vl=31, n_segs=4),
             # Seg 5
             make_test_case('vsseg5e8_v_u8mf4x5', vl=4, n_segs=5),
             make_test_case('vsseg5e8_v_u8mf4x5', vl=3, n_segs=5),