[tlul] Add byte level handling for integrity

Signed-off-by: Timothy Chen <timothytim@google.com>
diff --git a/hw/ip/tlul/adapter_sram.core b/hw/ip/tlul/adapter_sram.core
index aad1537..78d1aca 100644
--- a/hw/ip/tlul/adapter_sram.core
+++ b/hw/ip/tlul/adapter_sram.core
@@ -12,6 +12,7 @@
       - lowrisc:prim:assert
       - lowrisc:tlul:common
     files:
+      - rtl/tlul_sram_byte.sv
       - rtl/tlul_adapter_sram.sv
     file_type: systemVerilogSource
 
diff --git a/hw/ip/tlul/rtl/tlul_adapter_sram.sv b/hw/ip/tlul/rtl/tlul_adapter_sram.sv
index de4c9c4..ae89e12 100644
--- a/hw/ip/tlul/rtl/tlul_adapter_sram.sv
+++ b/hw/ip/tlul/rtl/tlul_adapter_sram.sv
@@ -61,6 +61,47 @@
   localparam int WoffsetWidth = (SramByte == top_pkg::TL_DBW) ? 1 :
                                 DataBitWidth - prim_util_pkg::vbits(top_pkg::TL_DBW);
 
+  // integrity check
+  logic intg_error;
+
+  if (CmdIntgCheck) begin : gen_cmd_intg_check
+    tlul_cmd_intg_chk u_cmd_intg_chk (
+      .tl_i(tl_i),
+      .err_o (intg_error)
+    );
+  end else begin : gen_no_cmd_intg_check
+    assign intg_error = '0;
+  end
+
+  // permanently latch integrity error until reset
+  logic intg_error_q;
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      intg_error_q <= '0;
+    end else if (intg_error) begin
+      intg_error_q <= 1'b1;
+    end
+  end
+
+  // integrity error output is permanent and should be used for alert generation
+  // or other downstream effects
+  assign intg_error_o = intg_error | intg_error_q;
+
+  // byte handling for integrity
+  tl_h2d_t tl_i_int;
+  tl_d2h_t tl_o_int;
+  tlul_sram_byte #(
+    .EnableIntg(ByteAccess & CmdIntgCheck & !ErrOnWrite)
+  ) u_sram_byte (
+    .clk_i,
+    .rst_ni,
+    .tl_i,
+    .tl_o,
+    .tl_sram_o(tl_i_int),
+    .tl_sram_i(tl_o_int),
+    .error_i(intg_error)
+  );
+
   typedef struct packed {
     logic [top_pkg::TL_DBW-1:0] mask ; // Byte mask within the TL-UL word
     logic [WoffsetWidth-1:0]    woffset ; // Offset of the TL-UL word within the SRAM word
@@ -106,7 +147,6 @@
   rsp_t rspfifo_wdata,  rspfifo_rdata;
 
   logic error_internal; // Internal protocol error checker
-  logic intg_error;
   logic wr_attr_error;
   logic instr_error;
   logic wr_vld_error;
@@ -114,8 +154,8 @@
   logic tlul_error;     // Error from `tlul_err` module
 
   logic a_ack, d_ack, sram_ack;
-  assign a_ack    = tl_i.a_valid & tl_o.a_ready ;
-  assign d_ack    = tl_o.d_valid & tl_i.d_ready ;
+  assign a_ack    = tl_i_int.a_valid & tl_o_int.a_ready ;
+  assign d_ack    = tl_o_int.d_valid & tl_i_int.d_ready ;
   assign sram_ack = req_o        & gnt_i ;
 
   // Valid handling
@@ -175,7 +215,7 @@
     .EnableDataIntgGen(EnableDataIntgGen)
   ) u_rsp_gen (
     .tl_i(tl_out),
-    .tl_o
+    .tl_o(tl_o_int)
   );
 
   // a_ready depends on the FIFO full condition and grant from SRAM (or SRAM arbiter)
@@ -185,17 +225,17 @@
   //    Generate request only when no internal error occurs. If error occurs, the request should be
   //    dropped and returned error response to the host. So, error to be pushed to reqfifo.
   //    In this case, it is assumed the request is granted (may cause ordering issue later?)
-  assign req_o      = tl_i.a_valid & reqfifo_wready & ~error_internal;
-  assign req_type_o = tl_i.a_user.tl_type;
-  assign we_o       = tl_i.a_valid & (tl_i.a_opcode inside {PutFullData, PutPartialData});
-  assign addr_o     = (tl_i.a_valid) ? tl_i.a_address[DataBitWidth+:SramAw] : '0;
+  assign req_o      = tl_i_int.a_valid & reqfifo_wready & ~error_internal;
+  assign req_type_o = tl_i_int.a_user.tl_type;
+  assign we_o       = tl_i_int.a_valid & (tl_i_int.a_opcode inside {PutFullData, PutPartialData});
+  assign addr_o     = (tl_i_int.a_valid) ? tl_i_int.a_address[DataBitWidth+:SramAw] : '0;
 
   // Support SRAMs wider than the TL-UL word width by mapping the parts of the
   // TL-UL address which are more fine-granular than the SRAM width to the
   // SRAM write mask.
   logic [WoffsetWidth-1:0] woffset;
   if (top_pkg::TL_DW != SramDw) begin : gen_wordwidthadapt
-    assign woffset = tl_i.a_address[DataBitWidth-1:prim_util_pkg::vbits(top_pkg::TL_DBW)];
+    assign woffset = tl_i_int.a_address[DataBitWidth-1:prim_util_pkg::vbits(top_pkg::TL_DBW)];
   end else begin : gen_no_wordwidthadapt
     assign woffset = '0;
   end
@@ -221,10 +261,10 @@
     wmask_int = '0;
     wdata_int = '0;
 
-    if (tl_i.a_valid) begin
+    if (tl_i_int.a_valid) begin
       for (int i = 0 ; i < top_pkg::TL_DW/8 ; i++) begin
-        wmask_int[woffset][8*i +: 8] = {8{tl_i.a_mask[i]}};
-        wdata_int[woffset][8*i +: 8] = (tl_i.a_mask[i] && we_o) ? tl_i.a_data[8*i+:8] : '0;
+        wmask_int[woffset][8*i +: 8] = {8{tl_i_int.a_mask[i]}};
+        wdata_int[woffset][8*i +: 8] = (tl_i_int.a_mask[i] && we_o) ? tl_i_int.a_data[8*i+:8] : '0;
       end
     end
   end
@@ -238,9 +278,9 @@
     wmask_intg  = '0;
     wdata_intg  = '0;
 
-    if (tl_i.a_valid) begin
+    if (tl_i_int.a_valid) begin
       wmask_intg[woffset] = '1;
-      wdata_intg[woffset] = tl_i.a_user.data_intg;
+      wdata_intg[woffset] = tl_i_int.a_user.data_intg;
     end
   end
 
@@ -265,54 +305,31 @@
   // wr_attr_error: Check if the request size,mask are permitted.
   //    Basic check of size, mask, addr align is done in tlul_err module.
   //    Here it checks any partial write if ByteAccess isn't allowed.
-  assign wr_attr_error = (tl_i.a_opcode == PutFullData || tl_i.a_opcode == PutPartialData)
-                         ? ((ByteAccess == 0) ? (tl_i.a_mask != '1 || tl_i.a_size != 2'h2) : 1'b0)
-                         : 1'b0;
+  assign wr_attr_error = (tl_i_int.a_opcode == PutFullData || tl_i_int.a_opcode == PutPartialData)
+                         ? ((ByteAccess == 0) ?
+                           (tl_i_int.a_mask != '1 || tl_i_int.a_size != 2'h2) : 1'b0)
+                           : 1'b0;
 
   // An instruction type transaction is only valid if en_ifetch is enabled
-  assign instr_error = tl_i.a_user.tl_type == InstrType &
+  assign instr_error = tl_i_int.a_user.tl_type == InstrType &
                        en_ifetch_i != InstrEn;
 
   if (ErrOnWrite == 1) begin : gen_no_writes
-    assign wr_vld_error = tl_i.a_opcode != Get;
+    assign wr_vld_error = tl_i_int.a_opcode != Get;
   end else begin : gen_writes_allowed
     assign wr_vld_error = 1'b0;
   end
 
   if (ErrOnRead == 1) begin: gen_no_reads
-    assign rd_vld_error = tl_i.a_opcode == Get;
+    assign rd_vld_error = tl_i_int.a_opcode == Get;
   end else begin : gen_reads_allowed
     assign rd_vld_error = 1'b0;
   end
 
-  if (CmdIntgCheck) begin : gen_cmd_intg_check
-    tlul_cmd_intg_chk u_cmd_intg_chk (
-      .tl_i,
-      .err_o (intg_error)
-    );
-  end else begin : gen_no_cmd_intg_check
-    assign intg_error = '0;
-  end
-
-
-  // permanently latch integrity error until reset
-  logic intg_error_q;
-  always_ff @(posedge clk_i or negedge rst_ni) begin
-    if (!rst_ni) begin
-      intg_error_q <= '0;
-    end else if (intg_error) begin
-      intg_error_q <= 1'b1;
-    end
-  end
-
-  // integrity error output is permanent and should be used for alert generation
-  // or other downstream effects
-  assign intg_error_o = intg_error | intg_error_q;
-
   tlul_err u_err (
     .clk_i,
     .rst_ni,
-    .tl_i,
+    .tl_i(tl_i_int),
     .err_o (tlul_error)
   );
 
@@ -323,16 +340,16 @@
 
   assign reqfifo_wvalid = a_ack ; // Push to FIFO only when granted
   assign reqfifo_wdata  = '{
-    op:     (tl_i.a_opcode != Get) ? OpWrite : OpRead, // To return AccessAck for opcode error
+    op:     (tl_i_int.a_opcode != Get) ? OpWrite : OpRead, // To return AccessAck for opcode error
     error:  error_internal,
-    size:   tl_i.a_size,
-    source: tl_i.a_source
+    size:   tl_i_int.a_size,
+    source: tl_i_int.a_source
   }; // Store the request only. Doesn't have to store data
   assign reqfifo_rready = d_ack ;
 
   // push together with ReqFIFO, pop upon returning read
   assign sramreqfifo_wdata = '{
-    mask    : tl_i.a_mask,
+    mask    : tl_i_int.a_mask,
     woffset : woffset
   };
   assign sramreqfifo_wvalid = sram_ack & ~we_o;
diff --git a/hw/ip/tlul/rtl/tlul_sram_byte.sv b/hw/ip/tlul/rtl/tlul_sram_byte.sv
new file mode 100644
index 0000000..1bbfa36
--- /dev/null
+++ b/hw/ip/tlul/rtl/tlul_sram_byte.sv
@@ -0,0 +1,251 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+`include "prim_assert.sv"
+
+/**
+ * Tile-Link UL adapter for SRAM-like devices
+ *
+ * This module handles byte writes for tlul integrity.
+ * When a byte write is received, the downstream data is read first
+ * to correctly create the integrity constant.
+ *
+ * A tlul transaction goes through this module.  If required, a
+ * tlul read transaction is generated out first.  If not required, the
+ * incoming tlul transaction is directly muxed out.
+ */
+module tlul_sram_byte import tlul_pkg::*; #(
+  parameter int EnableIntg  = 0  // Enable integrity handling at byte level
+) (
+  input clk_i,
+  input rst_ni,
+
+  input tl_h2d_t tl_i,
+  output tl_d2h_t tl_o,
+
+  output tl_h2d_t tl_sram_o,
+  input tl_d2h_t tl_sram_i,
+
+  // if incoming transaction already has an integrity error, do not
+  // attempt to handle the byte-write access.  Instead treat as
+  // feedthrough and allow the system to directly error back
+  input error_i
+);
+
+  // state enumeration
+  typedef enum logic [1:0] {
+    StPassThru,
+    StWaitRd,
+    StWriteCmd
+  } state_e;
+
+  // signal select enumeration
+  typedef enum logic [1:0] {
+    SelPassThru = 2'b01,
+    SelInt = 2'b10
+  } sel_sig_e;
+
+  // state and selection
+  sel_sig_e sel_int;
+  logic stall_host;
+  logic wr_phase;
+  state_e state_d, state_q;
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      state_q <= StPassThru;
+    end else begin
+      state_q <= state_d;
+    end
+  end
+
+  // transaction qualifying signals
+  logic a_ack;  // upstream a channel acknowledgement
+  logic sram_a_ack; // downstream a channel acknowledgement
+  logic sram_d_ack; // downstream d channel acknowledgement
+  logic wr_txn;
+  logic byte_wr_txn;
+  logic byte_req_ack;
+
+  assign a_ack = tl_i.a_valid & tl_o.a_ready;
+  assign sram_a_ack = tl_sram_o.a_valid & tl_sram_i.a_ready;
+  assign sram_d_ack = tl_sram_i.d_valid & tl_sram_o.d_ready;
+  assign wr_txn = (tl_i.a_opcode == PutFullData) | (tl_i.a_opcode == PutPartialData);
+  assign byte_req_ack = byte_wr_txn & a_ack & ~error_i;
+
+  // when to select internal signals instead of passthru
+  if (EnableIntg == 1) begin : gen_dyn_sel
+    assign byte_wr_txn = tl_i.a_valid & ~&tl_i.a_mask & wr_txn & (EnableIntg == 1);
+    assign sel_int = byte_wr_txn | stall_host ? SelInt : SelPassThru;
+  end else begin : gen_static_sel
+    assign byte_wr_txn = '0;
+    assign sel_int = SelPassThru;
+  end
+
+  // state machine handling
+  always_comb begin
+    stall_host = 1'b0;
+    wr_phase = 1'b0;
+    state_d = state_q;
+
+    unique case (state_q)
+      StPassThru: begin
+
+        if (byte_req_ack) begin
+          state_d = StWaitRd;
+        end
+      end
+
+      StWaitRd: begin
+        stall_host = 1'b1;
+
+        if (sram_d_ack) begin
+          state_d = StWriteCmd;
+        end
+      end
+
+      StWriteCmd: begin
+        stall_host = 1'b1;
+        wr_phase = 1'b1;
+
+        if (sram_a_ack) begin
+          state_d = StPassThru;
+        end
+      end
+
+      default:;
+
+    endcase // unique case (state_q)
+
+  end
+
+  // prim fifo for capturing info
+  typedef struct packed {
+    logic                  [2:0]  a_param;
+    logic  [top_pkg::TL_SZW-1:0]  a_size;
+    logic  [top_pkg::TL_AIW-1:0]  a_source;
+    logic   [top_pkg::TL_AW-1:0]  a_address;
+    logic  [top_pkg::TL_DBW-1:0]  a_mask;
+    logic   [top_pkg::TL_DW-1:0]  a_data;
+    tl_a_user_t                   a_user;
+  } tl_txn_data_t;
+
+  tl_txn_data_t txn_data;
+  tl_txn_data_t held_data;
+  logic fifo_rdy;
+  localparam int TxnDataWidth = $bits(tl_txn_data_t);
+
+  assign txn_data = '{
+    a_param: tl_i.a_param,
+    a_size: tl_i.a_size,
+    a_source: tl_i.a_source,
+    a_address: tl_i.a_address,
+    a_mask: tl_i.a_mask,
+    a_data: tl_i.a_data,
+    a_user: tl_i.a_user
+  };
+
+  prim_fifo_sync #(
+    .Width(TxnDataWidth),
+    .Pass(1'b0),
+    .Depth(1),
+    .OutputZeroIfEmpty(1'b0)
+  ) u_sync_fifo (
+    .clk_i,
+    .rst_ni,
+    .clr_i(1'b0),
+    .wvalid_i(byte_req_ack),
+    .wready_o(fifo_rdy),
+    .wdata_i(txn_data),
+    .rvalid_o(),
+    .rready_i(sram_a_ack),
+    .rdata_o(held_data),
+    .full_o(),
+    .depth_o()
+  );
+
+  // captured read data
+  logic [top_pkg::TL_DW-1:0] rsp_data;
+  always_ff @(posedge clk_i) begin
+    if (sram_d_ack) begin
+      rsp_data <= tl_sram_i.d_data;
+    end
+  end
+
+  // internally generated transactions
+  tl_h2d_t tl_h2d_int, tl_h2d_intg;
+
+  // while we could simply not assert a_ready to ensure the host keeps
+  // the request lines stable, there is no guarantee the hosts (if there are multiple)
+  // do not re-arbitrate on every cycle if its transactions are not accepted.
+  // As a result, it is better to capture the transaction attributes.
+  logic [top_pkg::TL_DW-1:0] combined_data;
+  always_comb begin
+    for (int i = 0; i < top_pkg::TL_DBW; i++) begin
+      combined_data[i*8 +: 8] = held_data.a_mask[i] ?
+                                held_data.a_data[i*8 +: 8] :
+                                rsp_data[i*8 +: 8];
+    end
+  end
+
+  assign tl_h2d_int = '{
+    // use incoming valid as long as we are not stalling the host
+    // otherwise look at whether there is a pending write.
+    a_valid:   (tl_i.a_valid & ~stall_host) | wr_phase,
+    a_opcode:  wr_phase ? PutFullData         : Get,
+    a_param:   wr_phase ? held_data.a_param   : tl_i.a_param,  // registered param
+    a_size:    wr_phase ? held_data.a_size    : tl_i.a_size,   // registered size
+    a_source:  wr_phase ? held_data.a_source  : tl_i.a_source, // registered source
+    a_address: wr_phase ? held_data.a_address : tl_i.a_address,// registered address
+    a_mask:    '{default: '1},
+    a_data:    wr_phase ? combined_data       : tl_i.a_data,   // registered data
+    a_user:    wr_phase ? held_data.a_user    : tl_i.a_user,   // registered user
+    d_ready:   1'b1
+  };
+
+  // outgoing tlul transactions
+  tlul_cmd_intg_gen #(
+    .EnableDataIntgGen(EnableIntg)
+  ) u_intg_gen (
+    .tl_i(tl_h2d_int),
+    .tl_o(tl_h2d_intg)
+  );
+
+  if (EnableIntg) begin : gen_intg_hookup
+    assign tl_sram_o = (sel_int == SelInt) ? tl_h2d_intg : tl_i;
+
+    always_comb begin
+      tl_o = tl_sram_i;
+
+      // pass a_ready through directly if we are not stalling
+      tl_o.a_ready = tl_sram_i.a_ready & ~stall_host & fifo_rdy;
+
+      // when internal logic has taken over, do not show response to host during
+      // read phase.  During write phase, allow the host to see the completion.
+      tl_o.d_valid = tl_sram_i.d_valid & ~stall_host;
+    end
+
+  end else begin : gen_tieoffs
+    logic unused_sigs;
+    assign unused_sigs = |tl_h2d_intg ^ |sel_int ^ fifo_rdy ^ byte_req_ack ^ wr_txn ^ error_i;
+    assign tl_sram_o = tl_i;
+    assign tl_o = tl_sram_i;
+  end
+
+  // byte access should trigger state change
+  if (EnableIntg) begin : gen_intg_asserts
+    // when byte access detected, go to wait read
+    `ASSERT(ByteAccessStateChange_A, a_ack & wr_txn & ~&tl_i.a_mask |=> state_q == StWaitRd)
+
+    // when in wait for read, a successful response should move to write phase
+    `ASSERT(ReadCompleteStateChange_A, (state_q == StWaitRd) & sram_d_ack |=> state_q == StWriteCmd)
+
+  end else begin : gen_non_intg_asserts
+    // state should never change, internal signal should never be selected
+    `ASSERT(StableSignals_A, (state_q == StPassThru) & sel_int == (SelPassThru))
+
+  end
+
+
+endmodule // tlul_adapter_sram
diff --git a/util/topgen/templates/toplevel.sv.tpl b/util/topgen/templates/toplevel.sv.tpl
index c7b5180..9a7d1a7 100644
--- a/util/topgen/templates/toplevel.sv.tpl
+++ b/util/topgen/templates/toplevel.sv.tpl
@@ -509,7 +509,7 @@
     .ErrOnWrite(1),
     .CmdIntgCheck(1),
     .EnableRspIntgGen(1),
-    .EnableDataIntgGen(1) // TODO: Needs to be updated for intgerity passthrough
+    .EnableDataIntgGen(0)
   ) u_tl_adapter_${m["name"]} (
     % for key in clocks:
     .${key}   (${clocks[key]}),