Fix segmented load-store register layout
The order of operations, as observed from memory, is unchanged; only
the order of register access changes. For segmented loads and stores
with LMUL > 1, rd previously advanced by one register per writeback,
which interleaved the fields incorrectly. It now advances by LMUL per
segment (nextSegmentVectorOffset) and, at the end of each LMUL
iteration, rewinds to the next register of field 0
(nextLmulVectorRewind).

The fix is now complete, and the affected tests are re-enabled.
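
For illustration, a minimal software model of the corrected register
walk (plain Scala, not the RTL; segmentRegisterOrder and its parameter
names are hypothetical):

    // Per RVV, field f of a vlseg<nf> instruction occupies the vector
    // register group starting at vd + f * lmul, and the LSU retires one
    // register per pass, so pass i of field f targets vd + f*lmul + i.
    def segmentRegisterOrder(vd: Int, nfields: Int, lmul: Int): Seq[Int] =
      for {
        i <- 0 until lmul      // one pass per register within a group
        f <- 0 until nfields   // fields sit lmul registers apart
      } yield vd + f * lmul + i

    // Example: vlseg2e8.v with lmul=2 and vd=v4 touches
    // v4, v6 (pass 0), then v5, v7 (pass 1):
    //   segmentRegisterOrder(4, 2, 2) == Seq(4, 6, 5, 7)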
Change-Id: Ib7bf17b0d60c18a8b0b4cb8163277cd0afe8e37c
diff --git a/hdl/chisel/src/kelvin/scalar/Lsu.scala b/hdl/chisel/src/kelvin/scalar/Lsu.scala
index 0e603fd..832db87 100644
--- a/hdl/chisel/src/kelvin/scalar/Lsu.scala
+++ b/hdl/chisel/src/kelvin/scalar/Lsu.scala
@@ -276,6 +276,10 @@
val elemWidth = UInt(3.W)
val nfields = UInt(3.W)
val segment = UInt(3.W)
+ // Add this to find the next segment.
+ val nextSegmentVectorOffset = UInt(3.W)
+ // Subtract this to find the first segment in the next lmul.
+ val nextLmulVectorRewind = UInt(3.W)
// If the slot has no pending tasks and can accept a new operation
def slotIdle(): Bool = {
@@ -322,6 +326,8 @@
result.baseAddr := baseAddr
result.elemStride := elemStride
result.segmentStride := segmentStride
+ result.nextSegmentVectorOffset := nextSegmentVectorOffset
+ result.nextLmulVectorRewind := nextLmulVectorRewind
val segmentBaseAddr = baseAddr + (segmentStride * segment)
@@ -382,6 +388,8 @@
result.elemWidth := elemWidth
result.nfields := nfields
result.segment := segment
+ result.nextSegmentVectorOffset := nextSegmentVectorOffset
+ result.nextLmulVectorRewind := nextLmulVectorRewind
result
}
@@ -405,6 +413,8 @@
result.segmentStride := segmentStride
result.elemWidth := elemWidth
result.nfields := nfields
+ result.nextSegmentVectorOffset := nextSegmentVectorOffset
+ result.nextLmulVectorRewind := nextLmulVectorRewind
val vectorWriteback = writeback && LsuOp.isVector(op)
@@ -447,7 +457,12 @@
(baseAddr + (nfields * 16.U) + 16.U),
// Indexed don't have base addr changed.
))
- result.rd := rd + 1.U // Move to next vector register
+ result.rd := MuxCase(rd, Seq(
+ // Finished one lmul, start from segment 0 again
+ lmulUpdate -> (rd - nextLmulVectorRewind + 1.U),
+ // Finished one segment, proceed to next
+ vectorWriteback -> (rd + nextSegmentVectorOffset),
+ ))
result
}
@@ -479,6 +494,8 @@
result.elemWidth := elemWidth
result.nfields := nfields
result.segment := segment
+ result.nextSegmentVectorOffset := nextSegmentVectorOffset
+ result.nextLmulVectorRewind := nextLmulVectorRewind
result
}
@@ -524,11 +541,17 @@
result.store := uop.store
result.pc := uop.pc
if (p.enableRvv) {
- result.lmul := uop.lmul.getOrElse(0.U)
- result.nfields := Mux(LsuOp.isVector(uop.op), uop.nfields.get, 0.U)
+ val lmul = uop.lmul.getOrElse(0.U)
+ result.lmul := lmul
+ val nfields = Mux(LsuOp.isVector(uop.op), uop.nfields.get, 0.U)
+ result.nfields := nfields
result.segment := 0.U
+ result.nextSegmentVectorOffset := lmul
+ result.nextLmulVectorRewind := nfields * lmul
} else {
result.lmul := 0.U
+ result.nextSegmentVectorOffset := 0.U
+ result.nextLmulVectorRewind := 0.U
}
// All vector ops require writeback. Lsu needs to inform RVV core store uop
diff --git a/tests/cocotb/BUILD b/tests/cocotb/BUILD
index 43b8c76..7efc947 100644
--- a/tests/cocotb/BUILD
+++ b/tests/cocotb/BUILD
@@ -187,10 +187,9 @@
"load8_segment2_unit_m1",
"load16_segment2_unit_m1",
"load32_segment2_unit_m1",
- # TODO(davidgao): re-enable once fixes are in.
- # "load8_segment2_unit_m2",
- # "load16_segment2_unit_m2",
- # "load32_segment2_unit_m2",
+ "load8_segment2_unit_m2",
+ "load16_segment2_unit_m2",
+ "load32_segment2_unit_m2",
"load8_segment2_stride6_m1",
"load16_segment2_stride6_m1",
"load8_indexed_m1",
diff --git a/tests/cocotb/rvv_load_store_test.py b/tests/cocotb/rvv_load_store_test.py
index 4395db2..f7d0b40 100644
--- a/tests/cocotb/rvv_load_store_test.py
+++ b/tests/cocotb/rvv_load_store_test.py
@@ -414,8 +414,7 @@
)
-# TODO(davidgao): re-enable once fixes are in.
-@cocotb.test(skip=True)
+@cocotb.test()
async def load8_segment2_unit_m2(dut):
await vector_load_store(
dut=dut,
@@ -427,8 +426,7 @@
)
-# TODO(davidgao): re-enable once fixes are in.
-@cocotb.test(skip=True)
+@cocotb.test()
async def load16_segment2_unit_m2(dut):
await vector_load_store(
dut=dut,
@@ -440,8 +438,7 @@
)
-# TODO(davidgao): re-enable once fixes are in.
-@cocotb.test(skip=True)
+@cocotb.test()
async def load32_segment2_unit_m2(dut):
await vector_load_store(
dut=dut,
@@ -517,10 +514,10 @@
make_test_case('vsseg2e8_v_u8mf2x2', vl=7, n_segs=2),
make_test_case('vsseg2e8_v_u8m1x2', vl=16, n_segs=2),
make_test_case('vsseg2e8_v_u8m1x2', vl=15, n_segs=2),
- # make_test_case('vsseg2e8_v_u8m2x2', vl=32, n_segs=2),
- # make_test_case('vsseg2e8_v_u8m2x2', vl=31, n_segs=2),
- # make_test_case('vsseg2e8_v_u8m4x2', vl=64, n_segs=2),
- # make_test_case('vsseg2e8_v_u8m4x2', vl=63, n_segs=2),
+ make_test_case('vsseg2e8_v_u8m2x2', vl=32, n_segs=2),
+ make_test_case('vsseg2e8_v_u8m2x2', vl=31, n_segs=2),
+ make_test_case('vsseg2e8_v_u8m4x2', vl=64, n_segs=2),
+ make_test_case('vsseg2e8_v_u8m4x2', vl=63, n_segs=2),
# Seg 3
make_test_case('vsseg3e8_v_u8mf4x3', vl=4, n_segs=3),
make_test_case('vsseg3e8_v_u8mf4x3', vl=3, n_segs=3),
@@ -528,8 +525,8 @@
make_test_case('vsseg3e8_v_u8mf2x3', vl=7, n_segs=3),
make_test_case('vsseg3e8_v_u8m1x3', vl=16, n_segs=3),
make_test_case('vsseg3e8_v_u8m1x3', vl=15, n_segs=3),
- # make_test_case('vsseg3e8_v_u8m2x3', vl=32, n_segs=3),
- # make_test_case('vsseg3e8_v_u8m2x3', vl=31, n_segs=3),
+ make_test_case('vsseg3e8_v_u8m2x3', vl=32, n_segs=3),
+ make_test_case('vsseg3e8_v_u8m2x3', vl=31, n_segs=3),
# Seg 4
make_test_case('vsseg4e8_v_u8mf4x4', vl=4, n_segs=4),
make_test_case('vsseg4e8_v_u8mf4x4', vl=3, n_segs=4),
@@ -537,8 +534,8 @@
make_test_case('vsseg4e8_v_u8mf2x4', vl=7, n_segs=4),
make_test_case('vsseg4e8_v_u8m1x4', vl=16, n_segs=4),
make_test_case('vsseg4e8_v_u8m1x4', vl=15, n_segs=4),
- # make_test_case('vsseg4e8_v_u8m2x4', vl=32, n_segs=4),
- # make_test_case('vsseg4e8_v_u8m2x4', vl=31, n_segs=4),
+ make_test_case('vsseg4e8_v_u8m2x4', vl=32, n_segs=4),
+ make_test_case('vsseg4e8_v_u8m2x4', vl=31, n_segs=4),
# Seg 5
make_test_case('vsseg5e8_v_u8mf4x5', vl=4, n_segs=5),
make_test_case('vsseg5e8_v_u8mf4x5', vl=3, n_segs=5),