[otbn] Add support for base load and store

Signed-off-by: Greg Chadwick <gac@lowrisc.org>
diff --git a/hw/ip/otbn/dv/smoke/smoke_expected.txt b/hw/ip/otbn/dv/smoke/smoke_expected.txt
index f1c24f3..cb07e49 100644
--- a/hw/ip/otbn/dv/smoke/smoke_expected.txt
+++ b/hw/ip/otbn/dv/smoke/smoke_expected.txt
@@ -17,8 +17,8 @@
 14  | 0x4c000000
 15  | 0x00000034
 16  | 0xfffffff4
-17  | 0x00000000
-18  | 0x00000000
+17  | 0xfacefeed
+18  | 0xd0beb533
 19  | 0x00000000
 20  | 0x00000000
 21  | 0x00000000
diff --git a/hw/ip/otbn/dv/smoke/smoke_test.S b/hw/ip/otbn/dv/smoke/smoke_test.S
index 994f382..f1db4f3 100644
--- a/hw/ip/otbn/dv/smoke/smoke_test.S
+++ b/hw/ip/otbn/dv/smoke/smoke_test.S
@@ -5,6 +5,8 @@
 # OTBN Smoke test, runs various instructions which are expected to produce the
 # final register state see in smoke_expected.txt
 
+.section .text
+
 # x1 = 0xd0beb513
 lui x1, 0xd0beb
 add x1, x1, 0x513
@@ -54,4 +56,20 @@
 # x16 = x1 >>> x2[4:0] = 0xfffffff4
 sra x16, x1, x2
 
+# x17 = dmem[16] = 0xfacefeed
+lw x17, 16(x0)
+
+# dmem[4] = x8 = 0xd0beb533
+sw x8, 4(x0)
+
+# x18 = dmem[4] = 0xd0beb533
+lw x18, 4(x0)
+
 ecall
+
+.section .data
+.word 0x1234abcd
+.word 0xbaadf00d
+.word 0xcafed00d
+.word 0xdeadbeef
+.word 0xfacefeed
diff --git a/hw/ip/otbn/lint/otbn.vlt b/hw/ip/otbn/lint/otbn.vlt
index 4965640..dabd103 100644
--- a/hw/ip/otbn/lint/otbn.vlt
+++ b/hw/ip/otbn/lint/otbn.vlt
@@ -8,3 +8,6 @@
 // complete
 lint_off -rule UNUSED -file "*/rtl/otbn_*"
 
+// Operator EQ expects 32 bits on the LHS, but LHS's VARREF 'addr' generates 3
+// bits.
+lint_off -rule WIDTH -file "*/rtl/otbn_lsu.sv" -match "*'addr' generates 3 bits*"
diff --git a/hw/ip/otbn/rtl/otbn_controller.sv b/hw/ip/otbn/rtl/otbn_controller.sv
index 0d42984..d9fd588 100644
--- a/hw/ip/otbn/rtl/otbn_controller.sv
+++ b/hw/ip/otbn/rtl/otbn_controller.sv
@@ -52,40 +52,90 @@
   output alu_base_operation_t  alu_base_operation_o,
   output alu_base_comparison_t alu_base_comparison_o,
   input  logic [31:0]          alu_base_operation_result_i,
-  input  logic                 alu_base_comparison_result_i
+  input  logic                 alu_base_comparison_result_i,
+
+  output logic                     lsu_load_req_o,
+  output logic                     lsu_store_req_o,
+  output insn_subset_e             lsu_req_subset_o,
+  output logic [DmemAddrWidth-1:0] lsu_addr_o,
+
+  output logic [31:0]              lsu_base_wdata_o,
+  output logic [WLEN-1:0]          lsu_bignum_wdata_o,
+
+  input  logic [31:0]              lsu_base_rdata_i,
+  input  logic [WLEN-1:0]          lsu_bignum_rdata_i,
+  input  logic [1:0]               lsu_rdata_err_i // Bit1: Uncorrectable, Bit0: Correctable
 );
 
+  typedef enum logic [1:0] {
+    OtbnStateHalt,
+    OtbnStateRun,
+    OtbnStateStall
+  } otbn_state_e;
 
-  logic running_q, running_d;
+  otbn_state_e state_q, state_d;
 
+  logic stall;
+  logic mem_stall;
+
+  // Stall a cycle on loads to allow load data writeback to happen the following cycle. Stall not
+  // required on stores as there is no response to deal with.
+  // TODO: Possibility of error response on store? Probably still don't need to stall in that case
+  // just ensure incoming store error stops anything else happening.
+  assign mem_stall = lsu_load_req_o;
+
+  assign stall = mem_stall;
   assign done_o = insn_valid_i && insn_dec_ctrl_i.ecall_insn;
 
   always_comb begin
-    running_d              = running_q;
+    state_d                = state_q;
     insn_fetch_req_valid_o = 1'b0;
     insn_fetch_req_addr_o  = start_addr_i;
 
-    if (start_i) begin
-      running_d = 1'b1;
-      insn_fetch_req_addr_o  = start_addr_i;
-      insn_fetch_req_valid_o = 1'b01;
-    end else if (running_q) begin
-      insn_fetch_req_valid_o = 1'b1;
-      insn_fetch_req_addr_o  = insn_addr_i + 'd4;
-    end
-
-    if (done_o) begin
-      running_d              = 1'b0;
-      insn_fetch_req_valid_o = 1'b0;
-    end
+    // TODO: Harden state machine
     // TODO: Jumps/branches
+    unique case (state_q)
+      OtbnStateHalt: begin
+        if (start_i) begin
+          state_d = OtbnStateRun;
+          insn_fetch_req_addr_o  = start_addr_i;
+          insn_fetch_req_valid_o = 1'b01;
+        end
+      end
+      OtbnStateRun: begin
+        insn_fetch_req_valid_o = 1'b1;
+
+        if (done_o) begin
+          state_d                = OtbnStateHalt;
+          insn_fetch_req_valid_o = 1'b0;
+        end else begin
+          // When stalling refetch the same instruction to keep decode inputs constant
+          if (stall) begin
+            state_d = OtbnStateStall;
+            insn_fetch_req_addr_o = insn_addr_i;
+          end else begin
+            insn_fetch_req_addr_o = insn_addr_i + 'd4;
+          end
+        end
+      end
+      OtbnStateStall: begin
+        // Only ever stall for a single cycle
+        // TODO: Any more than one cycle stall cases?
+        insn_fetch_req_valid_o = 1'b1;
+        insn_fetch_req_addr_o  = insn_addr_i + 'd4;
+        state_d = OtbnStateRun;
+      end
+      default: ;
+    endcase
   end
 
+  `ASSERT(ControllerStateValid, state_q inside {OtbnStateHalt, OtbnStateRun, OtbnStateStall});
+
   always_ff @(posedge clk_i or negedge rst_ni) begin
     if (!rst_ni) begin
-      running_q <= 1'b0;
+      state_q <= OtbnStateHalt;
     end else begin
-      running_q <= running_d;
+      state_q <= state_d;
     end
   end
 
@@ -123,8 +173,28 @@
 
   // Register file write MUX
   // TODO: Switch between CSR/ALU/LSU writeback.
-  assign rf_base_wr_en_o   = insn_dec_ctrl_i.rf_we;
-  assign rf_base_wr_addr_o = insn_dec_base_i.d;
-  assign rf_base_wr_data_o = alu_base_operation_result_i;
+  assign rf_base_wr_en_o =
+   insn_dec_ctrl_i.rf_we | (insn_dec_ctrl_i.ld_insn & (state_q == OtbnStateStall));
 
+  assign rf_base_wr_addr_o = insn_dec_base_i.d;
+
+  always_comb begin
+    rf_base_wr_data_o = alu_base_operation_result_i;
+
+    unique case (1'b1)
+      insn_dec_ctrl_i.ld_insn: rf_base_wr_data_o = lsu_base_rdata_i;
+      default: ;
+    endcase
+  end
+
+  // TODO: Add error on unaligned/out of bounds
+  assign lsu_load_req_o   = insn_valid_i & insn_dec_ctrl_i.ld_insn & (state_q == OtbnStateRun);
+  assign lsu_store_req_o  = insn_valid_i & insn_dec_ctrl_i.st_insn & (state_q == OtbnStateRun);
+  assign lsu_req_subset_o = insn_dec_ctrl_i.subset;
+  // TODO: Switch between address from base/bignum
+  assign lsu_addr_o       = alu_base_operation_result_i[DmemAddrWidth-1:0];
+  assign lsu_base_wdata_o = rf_base_rd_data_b_i;
+
+  // TODO: Bignum load/store
+  assign lsu_bignum_wdata_o = '0;
 endmodule
diff --git a/hw/ip/otbn/rtl/otbn_core.sv b/hw/ip/otbn/rtl/otbn_core.sv
index b44ba6a..2641858 100644
--- a/hw/ip/otbn/rtl/otbn_core.sv
+++ b/hw/ip/otbn/rtl/otbn_core.sv
@@ -83,6 +83,18 @@
   logic [31:0]          alu_base_operation_result;
   logic                 alu_base_comparison_result;
 
+  logic                     lsu_load_req;
+  logic                     lsu_store_req;
+  insn_subset_e             lsu_req_subset;
+  logic [DmemAddrWidth-1:0] lsu_addr;
+
+  logic [31:0]              lsu_base_wdata;
+  logic [WLEN-1:0]          lsu_bignum_wdata;
+
+  logic [31:0]              lsu_base_rdata;
+  logic [WLEN-1:0]          lsu_bignum_rdata;
+  logic [1:0]               lsu_rdata_err; // Bit1: Uncorrectable, Bit0: Correctable
+
   // Depending on its usage, the instruction address (program counter) is qualified by two valid
   // signals: insn_fetch_resp_valid (together with the undecoded instruction data), and insn_valid
   // for valid decoded (i.e. legal) instructions. Duplicate the signal in the source code for
@@ -170,7 +182,19 @@
     .alu_base_operation_o         (alu_base_operation),
     .alu_base_comparison_o        (alu_base_comparison),
     .alu_base_operation_result_i  (alu_base_operation_result),
-    .alu_base_comparison_result_i (alu_base_comparison_result)
+    .alu_base_comparison_result_i (alu_base_comparison_result),
+
+    .lsu_load_req_o     (lsu_load_req),
+    .lsu_store_req_o    (lsu_store_req),
+    .lsu_req_subset_o   (lsu_req_subset),
+    .lsu_addr_o         (lsu_addr),
+
+    .lsu_base_wdata_o   (lsu_base_wdata),
+    .lsu_bignum_wdata_o (lsu_bignum_wdata),
+
+    .lsu_base_rdata_i   (lsu_base_rdata),
+    .lsu_bignum_rdata_i (lsu_bignum_rdata),
+    .lsu_rdata_err_i    (lsu_rdata_err)
   );
 
   // Load store unit: read and write data from data memory
@@ -186,10 +210,19 @@
     .dmem_wmask_o,
     .dmem_rdata_i,
     .dmem_rvalid_i,
-    .dmem_rerror_i
+    .dmem_rerror_i,
 
-    // Data from base and bn execute blocks.
-    // TODO: Add signals to controller
+    .lsu_load_req_i     (lsu_load_req),
+    .lsu_store_req_i    (lsu_store_req),
+    .lsu_req_subset_i   (lsu_req_subset),
+    .lsu_addr_i         (lsu_addr),
+
+    .lsu_base_wdata_i   (lsu_base_wdata),
+    .lsu_bignum_wdata_i (lsu_bignum_wdata),
+
+    .lsu_base_rdata_o   (lsu_base_rdata),
+    .lsu_bignum_rdata_o (lsu_bignum_rdata),
+    .lsu_rdata_err_o    (lsu_rdata_err)
   );
 
   // Control and Status registers
diff --git a/hw/ip/otbn/rtl/otbn_decoder.sv b/hw/ip/otbn/rtl/otbn_decoder.sv
index 8cd4814..9956cd1 100644
--- a/hw/ip/otbn/rtl/otbn_decoder.sv
+++ b/hw/ip/otbn/rtl/otbn_decoder.sv
@@ -87,6 +87,8 @@
   rf_wd_sel_e rf_wdata_sel;
 
   logic ecall_insn;
+  logic ld_insn;
+  logic st_insn;
 
   // Reduced main ALU immediate MUX for Operand B
   logic [31:0] imm_b;
@@ -116,7 +118,9 @@
     alu_op:       alu_operator,
     rf_we:        rf_we,
     rf_wdata_sel: rf_wdata_sel,
-    ecall_insn:   ecall_insn
+    ecall_insn:   ecall_insn,
+    ld_insn:      ld_insn,
+    st_insn:      st_insn
   };
 
   /////////////
@@ -131,6 +135,8 @@
 
     illegal_insn          = 1'b0;
     ecall_insn            = 1'b0;
+    ld_insn               = 1'b0;
+    st_insn               = 1'b0;
 
     opcode                = insn_opcode_e'(insn[6:0]);
 
@@ -204,6 +210,34 @@
         end
       end
 
+      //////////////////
+      // Loads/Stores //
+      //////////////////
+
+      InsnOpcodeBaseLoad: begin
+        insn_subset = InsnSubsetBase;
+        rf_ren_a    = 1'b1;
+        ld_insn     = 1'b1;
+
+        if (insn[14:12] != 3'b010) begin
+          illegal_insn = 1'b1;
+        end
+      end
+
+      InsnOpcodeBaseStore: begin
+        insn_subset = InsnSubsetBase;
+        rf_ren_a    = 1'b1;
+        rf_ren_b    = 1'b1;
+        st_insn     = 1'b1;
+
+        if (insn[14:12] != 3'b010) begin
+          illegal_insn = 1'b1;
+        end
+      end
+
+      /////////////
+      // Special //
+      /////////////
 
       InsnOpcodeBaseSystem: begin
         insn_subset = InsnSubsetBase;
@@ -319,6 +353,24 @@
         end
       end
 
+      //////////////////
+      // Loads/Stores //
+      //////////////////
+
+      InsnOpcodeBaseLoad: begin
+        alu_op_a_mux_sel = OpASelRegister;
+        alu_op_b_mux_sel = OpBSelImmediate;
+        alu_operator     = AluOpAdd;
+        imm_b_mux_sel    = ImmBI;
+      end
+
+      InsnOpcodeBaseStore: begin
+        alu_op_a_mux_sel = OpASelRegister;
+        alu_op_b_mux_sel = OpBSelImmediate;
+        alu_operator     = AluOpAdd;
+        imm_b_mux_sel    = ImmBS;
+      end
+
       /////////////
       // Special //
       /////////////
diff --git a/hw/ip/otbn/rtl/otbn_lsu.sv b/hw/ip/otbn/rtl/otbn_lsu.sv
index d00fcd9..e7d2f81 100644
--- a/hw/ip/otbn/rtl/otbn_lsu.sv
+++ b/hw/ip/otbn/rtl/otbn_lsu.sv
@@ -32,15 +32,73 @@
   output logic [WLEN-1:0]          dmem_wmask_o,
   input  logic [WLEN-1:0]          dmem_rdata_i,
   input  logic                     dmem_rvalid_i,
-  input  logic [1:0]               dmem_rerror_i // Bit1: Uncorrectable, Bit0: Correctable
+  input  logic [1:0]               dmem_rerror_i, // Bit1: Uncorrectable, Bit0: Correctable
 
+  input  logic                     lsu_load_req_i,
+  input  logic                     lsu_store_req_i,
+  input  insn_subset_e             lsu_req_subset_i,
+  input  logic [DmemAddrWidth-1:0] lsu_addr_i,
+
+  input  logic [31:0]              lsu_base_wdata_i,
+  input  logic [WLEN-1:0]          lsu_bignum_wdata_i,
+
+  output logic [31:0]              lsu_base_rdata_o,
+  output logic [WLEN-1:0]          lsu_bignum_rdata_o,
+  output logic [1:0]               lsu_rdata_err_o // Bit1: Uncorrectable, Bit0: Correctable
 );
+  localparam int BaseWordsPerWLen = WLEN / 32;
+  localparam int BaseWordAddrW = prim_util_pkg::vbits(WLEN/8);
 
-  // tie-off to 0 to prevent warnings until implementation is done
-  assign dmem_req_o = 1'b0;
-  assign dmem_write_o = 1'b0;
-  assign dmem_addr_o = '0;
-  assign dmem_wdata_o = '0;
-  assign dmem_wmask_o = '0;
+  // Produce a WLEN bit mask for 32-bit writes given the 32-bit word write address. This doesn't
+  // propagate X so a seperate assertion must be used to check the input isn't X when a valid output
+  // is desired.
+  function automatic logic [WLEN-1:0] mask_from_word_addr(logic [BaseWordAddrW-1:2] addr);
+    logic [WLEN-1:0] mask;
+
+    mask = '0;
+
+    // Use of logic == int comparison in this loop works as BaseWordsPerWLen is a constant, so the
+    // loop can be unrolled. Due to the use of '==' any X or Z in addr will result in an X result
+    // for the comparison (so mask will remain 0).
+    for (int i = 0; i < BaseWordsPerWLen; i++) begin
+      if (addr == i) begin
+        mask[i*32+:32] = 32'hFFFFFFFF;
+      end
+    end
+
+    return mask;
+  endfunction
+
+  assign dmem_req_o   = lsu_load_req_i | lsu_store_req_i;
+  assign dmem_write_o = lsu_store_req_i;
+  assign dmem_addr_o  = lsu_addr_i;
+
+  // For base 32-bit writes replicate write data across dmem_wdata. dmem_wmask will be set
+  // appropriately so only the target word is written.
+  assign dmem_wdata_o = lsu_req_subset_i == InsnSubsetBase ?
+    {BaseWordsPerWLen{lsu_base_wdata_i}} : lsu_bignum_wdata_i;
+
+  assign dmem_wmask_o = lsu_req_subset_i == InsnSubsetBase ?
+    mask_from_word_addr(lsu_addr_i[BaseWordAddrW-1:2]) : {WLEN{1'b1}};
+
+  // From the WLEN word read from DMem select out a 32-bit word for base instructions.
+  for (genvar i_bit = 0; i_bit < 32; i_bit++) begin : g_base_rdata
+    logic [BaseWordsPerWLen-1:0] bit_mux;
+
+    for (genvar j_word = 0; j_word < BaseWordsPerWLen; j_word++) begin : g_bit_mux
+      assign bit_mux[j_word] =
+        (lsu_addr_i[BaseWordAddrW-1:2] == j_word) & dmem_rdata_i[i_bit + j_word * 32];
+    end
+
+    assign lsu_base_rdata_o[i_bit] = |bit_mux;
+  end
+
+  // Data appears the cycle following the request, LSU assume lsu_addr_i is kept stable by the
+  // controller to mux out the required 32-bit word.
+  `ASSERT(LsuLoadAddrStable, lsu_load_req_i |=> $stable(lsu_addr_i));
+  `ASSERT_KNOWN_IF(LsuAddrKnown, lsu_addr_i, lsu_load_req_i | lsu_store_req_i);
+
+  assign lsu_bignum_rdata_o = dmem_rdata_i;
+  assign lsu_rdata_err_o    = dmem_rerror_i;
 
 endmodule
diff --git a/hw/ip/otbn/rtl/otbn_pkg.sv b/hw/ip/otbn/rtl/otbn_pkg.sv
index 96bded4..288b476 100644
--- a/hw/ip/otbn/rtl/otbn_pkg.sv
+++ b/hw/ip/otbn/rtl/otbn_pkg.sv
@@ -163,6 +163,8 @@
     logic         rf_we;
     rf_wd_sel_e   rf_wdata_sel;
     logic         ecall_insn;
+    logic         ld_insn;
+    logic         st_insn;
   } insn_dec_ctrl_t;
 
   typedef struct packed {