[otbn] Fix BN.LID / BN.SID in ISS

Now, BN.LID / BN.SID load and store 256-bit little-endian values (so
byte 0 is the LSB; byte 31 is the MSB). As before (and as expected by
the RISC-V base ISA), the LW / SW instructions load and store 32-bit
little-endian values. Before this change, the code was a mess of
reversed() calls. After lots of debug prints, I think it's finally
both consistent and right!

This patch also updates the post-increment behaviour as discussed in
issue #3587 and expands the documentation a bit.

Signed-off-by: Rupert Swarbrick <rswarbrick@lowrisc.org>
diff --git a/hw/ip/otbn/data/bignum-insns.yml b/hw/ip/otbn/data/bignum-insns.yml
index a61ffe1..c2af187 100644
--- a/hw/ip/otbn/data/bignum-insns.yml
+++ b/hw/ip/otbn/data/bignum-insns.yml
@@ -779,15 +779,17 @@
   syntax: |
     <grd>[<grd_inc>], <offset>(<grs1>[<grs1_inc>])
   doc: |
-    Calculates a byte memory address by adding the offset to the value in the GPR `grs1`.
-    The value from this memory address is then copied into the WDR pointed to by the value in GPR `grd`.
+    Load a WLEN-bit little-endian value from data memory.
+
+    The load address is `offset` plus the value in the GPR `grs1`.
+    The loaded value is stored into the WDR given by the bottom 5 bits of the GPR `grd`.
 
     After the operation, either the value in the GPR `grs1`, or the value in `grd` can be optionally incremented.
 
-    - If `grs1_inc` is set, the value in `grs1` is incremented by the value WLEN/8 (one word).
-    - If `grd_inc` is set, the value in `grd` is incremented by the value 1.
+    - If `grs1_inc` is set, the value in `grs1` is incremented by value WLEN/8 (one word).
+    - If `grd_inc` is set, `grd` is updated to be `(*grd + 1) & 0x1f`.
 
-    The memory address must be aligned to WLEN bytes.
+    The memory address must be aligned to WLEN bits.
     Any address that is unaligned or is above the top of memory will result in an error (with error code `ErrCodeBadDataAddr`).
   cycles: 2
   decode: |
@@ -809,7 +811,7 @@
     if grs1_inc:
         GPR[rs1] = GPR[rs1] + (WLEN / 8)
     if grd_inc:
-        GPR[rd] = GPR[rd] + 1
+        GPR[rd] = (GPR[rd] + 1) & 0x1f
   lsu:
     type: mem-load
     target: [offset, grs1]
@@ -851,15 +853,17 @@
   syntax: |
     <grs2>[<grs2_inc>], <offset>(<grs1>[<grs1_inc>])
   doc: |
-    Calculates a byte memory address by adding the offset to the value in the GPR `grs1`.
-    The value from the WDR pointed to by `grs2` is then copied into the memory.
+    Store a WDR to memory as a WLEN-bit little-endian value.
+
+    The store address is `offset` plus the value in the GPR `grs1`.
+    The value to store is taken from the WDR given by the bottom 5 bits of the GPR `grs2`.
 
     After the operation, either the value in the GPR `grs1`, or the value in `grs2` can be optionally incremented.
 
     - If `grs1_inc` is set, the value in `grs1` is incremented by the value WLEN/8 (one word).
-    - If `grs2_inc` is set, the value in `grs2` is incremented by the value 1.
+    - If `grs2_inc` is set, the value in `grs2` is updated to be `(*grs2 + 1) & 0x1f`.
 
-    The memory address must be aligned to WLEN bytes.
+    The memory address must be aligned to WLEN bits.
     Any address that is unaligned or is above the top of memory will result in an error (with error code `ErrCodeBadDataAddr`).
   decode: |
     rs1 = UInt(grs1)
@@ -880,7 +884,7 @@
     if grs1_inc:
         GPR[rs1] = GPR[rs1] + (WLEN / 8)
     if grs2_inc:
-        GPR[rs2] = GPR[rs2] + 1
+        GPR[rs2] = (GPR[rs2] + 1) & 0x1f
   lsu:
     type: mem-store
     target: [offset, grs1]
diff --git a/hw/ip/otbn/dv/otbnsim/sim/dmem.py b/hw/ip/otbn/dv/otbnsim/sim/dmem.py
index a6bbaec..c97b0fe 100644
--- a/hw/ip/otbn/dv/otbnsim/sim/dmem.py
+++ b/hw/ip/otbn/dv/otbnsim/sim/dmem.py
@@ -61,7 +61,17 @@
         self.trace = []  # type: List[Trace]
 
     def _get_u32s(self, idx: int) -> List[int]:
-        '''Return the value at idx as 8 uint32's in little-endian order'''
+        '''Return the value at idx as 8 uint32's
+
+        These are ordered by increasing address, so the first word from
+        _get_u32s(0) will correspond to bytes at addresses 0x0 through 0x3.
+
+        These uint32's are the words that get loaded by word accesses to
+        memory. Since these accesses are also little-endian, the byte at memory
+        address 0x0 (which holds the LSB for the first 256-bit value) will also
+        be the LSB of the first u32 returned by _get_u32s(0).
+
+        '''
         assert 0 <= idx < len(self.data)
 
         word = self.data[idx]
@@ -73,7 +83,8 @@
         ret = []
         w32_mask = (1 << 32) - 1
         for subidx in range(8):
-            ret.append((word >> (32 * (7 - subidx))) & w32_mask)
+            ret.append((word >> (32 * subidx)) & w32_mask)
+
         return ret
 
     def _set_u32s(self, idx: int, u32s: List[int]) -> None:
@@ -92,12 +103,7 @@
         self.data[idx] = u256
 
     def load_le_words(self, data: bytes) -> None:
-        '''Replace the start of memory with data
-
-        This data should be in the form of little-endian 32-bit words. These
-        words are themselves packed little-endian into 256-bit words.
-
-        '''
+        '''Replace the start of memory with data'''
         if len(data) > 32 * len(self.data):
             raise ValueError('Trying to load {} bytes of data, but DMEM '
                              'is only {} bytes long.'
@@ -112,9 +118,6 @@
         for idx32, u32 in enumerate(struct.iter_unpack('<I', data)):
             acc.append(u32[0])
             if len(acc) == 8:
-                # Reverse acc here, because we're also little-endian moving
-                # from u32 to u256
-                acc.reverse()
                 self._set_u32s(idx32 // 8, acc)
                 acc = []
 
@@ -131,18 +134,16 @@
         '''
         u32s = []  # type: List[int]
         for idx in range(len(self.data)):
-            # As in load_le_words, we also have to reverse each set of 8 words
-            # because we're little-endian at this scale too.
-            u32s += reversed(self._get_u32s(idx))
+            u32s += self._get_u32s(idx)
         return struct.pack('<{}I'.format(len(u32s)), *u32s)
 
     def load_u256(self, addr: int) -> int:
-        '''Read an unsigned wide word from memory. addr should be aligned.'''
+        '''Read a u256 little-endian value from an aligned address'''
         assert 0 == addr % 32
         return self.data[addr // 32]
 
     def store_u256(self, addr: int, value: int) -> None:
-        '''Write an unsigned wide word to memory. addr should be aligned.'''
+        '''Write a u256 little-endian value to an aligned address'''
         assert 0 == addr % 32
         assert 0 <= value < (1 << 256)
         self.trace.append(TraceDmemStore(addr, value, True))
diff --git a/hw/ip/otbn/dv/otbnsim/sim/insn.py b/hw/ip/otbn/dv/otbnsim/sim/insn.py
index 4f048f1..d290b63 100644
--- a/hw/ip/otbn/dv/otbnsim/sim/insn.py
+++ b/hw/ip/otbn/dv/otbnsim/sim/insn.py
@@ -798,7 +798,7 @@
         state.wdrs.get_reg(wrd).write_unsigned(value)
 
         if self.grd_inc:
-            new_grd_val = (grd_val + 1) & ((1 << 32) - 1)
+            new_grd_val = (grd_val + 1) & 0x1f
             state.gprs.get_reg(self.grd).write_unsigned(new_grd_val)
 
         if self.grs1_inc:
@@ -832,7 +832,7 @@
             state.gprs.get_reg(self.grs1).write_unsigned(new_grs1_val)
 
         if self.grs2_inc:
-            new_grs2_val = (grs2_val + 1) & ((1 << 32) - 1)
+            new_grs2_val = (grs2_val + 1) & 0x1f
             state.gprs.get_reg(self.grs2).write_unsigned(new_grs2_val)