[prim_ram*_adv] Implement Byte parity in prim_ram*_adv

This implements a per-Byte parity option in prim_ram_2p_async_adv
and carries that change over to prim_ram_1p_adv / prim_ram_2p_adv.

A write mask signal is added to the interface, but it can only be used
if ECC is not enabled.

Signed-off-by: Michael Schaffner <msf@opentitan.org>
diff --git a/hw/ip/prim/rtl/prim_ram_1p_adv.sv b/hw/ip/prim/rtl/prim_ram_1p_adv.sv
index 6d54678..4452a18 100644
--- a/hw/ip/prim/rtl/prim_ram_1p_adv.sv
+++ b/hw/ip/prim/rtl/prim_ram_1p_adv.sv
@@ -3,63 +3,215 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 // Single-Port SRAM Wrapper
+//
+// Supported configurations:
+// - ECC for 32b wide memories with no write mask
+//   (Width == 32 && DataBitsPerMask == 32).
+// - Byte parity if Width is a multiple of 8 bit and write masks have Byte
+//   granularity (DataBitsPerMask == 8).
+//
+// Note that the write mask needs to be per Byte if parity is enabled. If ECC is enabled, the write
+// mask cannot be used and has to be tied to {Width{1'b1}}.
 
 `include "prim_assert.sv"
+`include "prim_util.svh"
 
 module prim_ram_1p_adv #(
-  // Parameters passed on the the SRAM primitive.
-  parameter  int Width           = 32, // bit
-  parameter  int Depth           = 128,
-  parameter  int DataBitsPerMask = 1, // Number of data bits per bit of write mask
-  parameter      MemInitFile     = "", // VMEM file to initialize the memory with
+  parameter  int Depth                = 512,
+  parameter  int Width                = 32,
+  parameter  int DataBitsPerMask      = 1,  // Number of data bits per bit of write mask
+  parameter  int CfgW                 = 8,  // WTC, RTC, etc
+  parameter      MemInitFile          = "", // VMEM file to initialize the memory with
 
-  parameter  int CfgW            = 8,     // WTC, RTC, etc
+  // Configurations
+  parameter  bit EnableECC            = 0, // Enables per-word ECC
+  parameter  bit EnableParity         = 0, // Enables per-Byte Parity
+  parameter  bit EnableInputPipeline  = 0, // Adds an input register (read latency +1)
+  parameter  bit EnableOutputPipeline = 0, // Adds an output register (read latency +1)
 
-  localparam int Aw              = $clog2(Depth)
+  localparam int Aw                   = vbits(Depth)
 ) (
-  input                     clk_i,
-  input                     rst_ni,
+  input clk_i,
+  input rst_ni,
 
-  input                     req_i,
-  input                     write_i,
-  input        [Aw-1:0]     addr_i,
-  input        [Width-1:0]  wdata_i,
-  input        [Width-1:0]  wmask_i,
-  output logic [Width-1:0]  rdata_o,
-  output logic              rvalid_o, // read response (rdata_o) is valid
-  output logic [1:0]        rerror_o, // Bit1: Uncorrectable, Bit0: Correctable
+  input                      req_i,
+  input                      write_i,
+  input        [Aw-1:0]      addr_i,
+  input        [Width-1:0]   wdata_i,
+  input        [Width-1:0]   wmask_i,
+  output logic [Width-1:0]   rdata_o,
+  output logic               rvalid_o, // read response (rdata_o) is valid
+  output logic [1:0]         rerror_o, // Bit1: Uncorrectable, Bit0: Correctable
 
-  input        [CfgW-1:0]   cfg_i
+  // config
+  input [CfgW-1:0] cfg_i
 );
 
-  // We will eventually use cfg_i for RTC/WTC or other memory parameters.
-  logic [CfgW-1:0] unused_cfg;
-  assign unused_cfg = cfg_i;
+  `ASSERT_INIT(CannotHaveEccAndParity_A, !(EnableParity && EnableECC))
+
+  // While we require DataBitsPerMask to be per Byte (8) at the interface in case Byte parity is
+  // enabled, we need to switch this to a per-bit mask locally such that we can individually enable
+  // the parity bits to be written alongside the data.
+  localparam int LocalDataBitsPerMask = (EnableParity) ? 1 : DataBitsPerMask;
+
+  // Calculate ECC width
+  localparam int ParWidth  = (EnableParity) ? Width/8 :
+                             (!EnableECC)   ? 0 :
+                             (Width <=   4) ? 4 :
+                             (Width <=  11) ? 5 :
+                             (Width <=  26) ? 6 :
+                             (Width <=  57) ? 7 :
+                             (Width <= 120) ? 8 : 8 ;
+  localparam int TotalWidth = Width + ParWidth;
+
+  ////////////////////////////
+  // RAM Primitive Instance //
+  ////////////////////////////
+
+  logic                    req_q,    req_d ;
+  logic                    write_q,  write_d ;
+  logic [Aw-1:0]           addr_q,   addr_d ;
+  logic [TotalWidth-1:0]   wdata_q,  wdata_d ;
+  logic [TotalWidth-1:0]   wmask_q,  wmask_d ;
+  logic                    rvalid_q, rvalid_d, rvalid_sram ;
+  logic [Width-1:0]        rdata_q,  rdata_d ;
+  logic [TotalWidth-1:0]   rdata_sram ;
+  logic [1:0]              rerror_q, rerror_d ;
 
   prim_ram_1p #(
-    .Width           (Width),
+    .MemInitFile     (MemInitFile),
+
+    .Width           (TotalWidth),
     .Depth           (Depth),
-    .DataBitsPerMask (DataBitsPerMask),
-    .MemInitFile     (MemInitFile)
+    .DataBitsPerMask (LocalDataBitsPerMask)
   ) u_mem (
     .clk_i,
 
-    .req_i,
-    .write_i,
-    .addr_i,
-    .wdata_i,
-    .wmask_i,
-    .rdata_o
+    .req_i    (req_q),
+    .write_i  (write_q),
+    .addr_i   (addr_q),
+    .wdata_i  (wdata_q),
+    .wmask_i  (wmask_q),
+    .rdata_o  (rdata_sram)
   );
 
-  always_ff @(posedge clk_i, negedge rst_ni) begin
+  always_ff @(posedge clk_i or negedge rst_ni) begin
     if (!rst_ni) begin
-      rvalid_o <= '0;
+      rvalid_sram <= 1'b0;
     end else begin
-      rvalid_o <= req_i & ~write_i;
+      rvalid_sram <= req_q & ~write_q;
     end
   end
 
-  assign rerror_o = 2'b0;
+  assign req_d              = req_i;
+  assign write_d            = write_i;
+  assign addr_d             = addr_i;
+  assign rvalid_o           = rvalid_q;
+  assign rdata_o            = rdata_q;
+  assign rerror_o           = rerror_q;
 
-endmodule
+  /////////////////////////////
+  // ECC / Parity Generation //
+  /////////////////////////////
+
+  if (EnableParity == 0 && EnableECC) begin : gen_secded
+
+    // check supported widths
+    `ASSERT_INIT(SecDecWidth_A, Width inside {32})
+
+    // the wmask is constantly set to 1 in this case
+    `ASSERT(OnlyWordWritePossibleWithEccPortA_A, req_i |->
+          wmask_i == {TotalWidth{1'b1}})
+
+    assign wmask_d = {TotalWidth{1'b1}};
+
+    if (Width == 32) begin : gen_secded_39_32
+      prim_secded_39_32_enc u_enc (.in(wdata_i), .out(wdata_d));
+      prim_secded_39_32_dec u_dec (
+        .in         (rdata_sram),
+        .d_o        (rdata_d[0+:Width]),
+        .syndrome_o ( ),
+        .err_o      (rerror_d)
+      );
+    end
+  end else if (EnableParity) begin : gen_byte_parity
+
+    `ASSERT_INIT(WidthNeedsToBeByteAligned_A, Width % 8 == 0)
+    `ASSERT_INIT(ParityNeedsByteWriteMask_A, DataBitsPerMask == 8)
+
+    always_comb begin : p_parity
+      rerror_d = '0;
+      wmask_d[0+:Width] = wmask_i;
+      wdata_d[0+:Width] = wdata_i;
+
+      for (int i = 0; i < Width/8; i ++) begin
+        // parity generation (odd parity)
+        wdata_d[Width + i] = ~(^wdata_i[i*8 +: 8]);
+        wmask_d[Width + i] = &wmask_i[i*8 +: 8];
+        // parity decoding (errors are always uncorrectable)
+        rerror_d[1] |= ~(^{rdata_sram[i*8 +: 8], rdata_sram[Width + i]});
+      end
+      // tie to zero if the read data is not valid
+      rerror_d &= {2{rvalid_sram}};
+    end
+
+    assign rdata_d  = rdata_sram[0+:Width];
+  end else begin : gen_nosecded_noparity
+    assign wmask_d = wmask_i;
+    assign wdata_d = wdata_i;
+
+    assign rdata_d  = rdata_sram[0+:Width];
+    assign rerror_d = '0;
+  end
+
+  assign rvalid_d = rvalid_sram;
+
+  /////////////////////////////////////
+  // Input/Output Pipeline Registers //
+  /////////////////////////////////////
+
+  if (EnableInputPipeline) begin : gen_regslice_input
+    // Put the register slices between ECC encoding to SRAM port
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        req_q   <= '0;
+        write_q <= '0;
+        addr_q  <= '0;
+        wdata_q <= '0;
+        wmask_q <= '0;
+      end else begin
+        req_q   <= req_d;
+        write_q <= write_d;
+        addr_q  <= addr_d;
+        wdata_q <= wdata_d;
+        wmask_q <= wmask_d;
+      end
+    end
+  end else begin : gen_dirconnect_input
+    assign req_q   = req_d;
+    assign write_q = write_d;
+    assign addr_q  = addr_d;
+    assign wdata_q = wdata_d;
+    assign wmask_q = wmask_d;
+  end
+
+  if (EnableOutputPipeline) begin : gen_regslice_output
+    // Put the register slices between ECC decoding to output
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        rvalid_q <= '0;
+        rdata_q  <= '0;
+        rerror_q <= '0;
+      end else begin
+        rvalid_q <= rvalid_d;
+        rdata_q  <= rdata_d;
+        rerror_q <= rerror_d;
+      end
+    end
+  end else begin : gen_dirconnect_output
+    assign rvalid_q = rvalid_d;
+    assign rdata_q  = rdata_d;
+    assign rerror_q = rerror_d;
+  end
+
+endmodule : prim_ram_1p_adv
diff --git a/hw/ip/prim/rtl/prim_ram_2p_adv.sv b/hw/ip/prim/rtl/prim_ram_2p_adv.sv
index 4fc1b72..2d9e5a8 100644
--- a/hw/ip/prim/rtl/prim_ram_2p_adv.sv
+++ b/hw/ip/prim/rtl/prim_ram_2p_adv.sv
@@ -2,271 +2,91 @@
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 //
-// Dual-port SRAM Wrapper
-//   This module to connect SRAM interface to actual SRAM interface
-//   At this time, it doesn't utilize ECC or any pipeline.
-//   This module stays to include any additional calculation logic later on.
-//   Instantiating SRAM is up to the top design to remove process dependency.
-
-// Parameter
-//   EnableECC:
-//   EnableParity:
-//   EnableInputPipeline:
-//   EnableOutputPipeline:
+// Dual-Port SRAM Wrapper
+//
+// Supported configurations:
+// - ECC for 32b wide memories with no write mask
+//   (Width == 32 && DataBitsPerMask == 32).
+// - Byte parity if Width is a multiple of 8 bit and write masks have Byte
+//   granularity (DataBitsPerMask == 8).
+//
+// Note that the write mask needs to be per Byte if parity is enabled. If ECC is enabled, the write
+// mask cannot be used and has to be tied to {Width{1'b1}}.
 
 `include "prim_assert.sv"
+`include "prim_util.svh"
 
 module prim_ram_2p_adv #(
   parameter  int Depth                = 512,
   parameter  int Width                = 32,
-  parameter  int CfgW                 = 8, // WTC, RTC, etc
+  parameter  int DataBitsPerMask      = 1,  // Number of data bits per bit of write mask
+  parameter  int CfgW                 = 8,  // WTC, RTC, etc
   parameter      MemInitFile          = "", // VMEM file to initialize the memory with
 
   // Configurations
-  parameter  bit EnableECC            = 0,
-  parameter  bit EnableParity         = 0,
-  parameter  bit EnableInputPipeline  = 0,
-  parameter  bit EnableOutputPipeline = 0,
+  parameter  bit EnableECC            = 0, // Enables per-word ECC
+  parameter  bit EnableParity         = 0, // Enables per-Byte Parity
+  parameter  bit EnableInputPipeline  = 0, // Adds an input register (read latency +1)
+  parameter  bit EnableOutputPipeline = 0, // Adds an output register (read latency +1)
 
-  parameter      MemT                 = "REGISTER", // can be "REGISTER" or "SRAM"
-
-  localparam int Aw                   = $clog2(Depth)
+  localparam int Aw                   = vbits(Depth)
 ) (
-  input                     clk_i,
-  input                     rst_ni,
+  input                    clk_i,
+  input                    rst_ni,
 
-  input                     a_req_i,
-  input                     a_write_i,
-  input        [Aw-1:0]     a_addr_i,
-  input        [Width-1:0]  a_wdata_i,
-  output logic [Width-1:0]  a_rdata_o,
-  output logic              a_rvalid_o, // read response (a_rdata_o) is valid
-  output logic [1:0]        a_rerror_o, // Bit1: Uncorrectable, Bit0: Correctable
+  input                    a_req_i,
+  input                    a_write_i,
+  input        [Aw-1:0]    a_addr_i,
+  input        [Width-1:0] a_wdata_i,
+  input        [Width-1:0] a_wmask_i,  // cannot be used with ECC, tie to 1 in that case
+  output logic [Width-1:0] a_rdata_o,
+  output logic             a_rvalid_o, // read response (a_rdata_o) is valid
+  output logic [1:0]       a_rerror_o, // Bit1: Uncorrectable, Bit0: Correctable
 
-  input                     b_req_i,
-  input                     b_write_i,
-  input        [Aw-1:0]     b_addr_i,
-  input        [Width-1:0]  b_wdata_i,
-  output logic [Width-1:0]  b_rdata_o,
-  output logic              b_rvalid_o, // read response (b_rdata_o) is valid
-  output logic [1:0]        b_rerror_o, // Bit1: Uncorrectable, Bit0: Correctable
+  input                    b_req_i,
+  input                    b_write_i,
+  input        [Aw-1:0]    b_addr_i,
+  input        [Width-1:0] b_wdata_i,
+  input        [Width-1:0] b_wmask_i,  // cannot be used with ECC, tie to 1 in that case
+  output logic [Width-1:0] b_rdata_o,
+  output logic             b_rvalid_o, // read response (b_rdata_o) is valid
+  output logic [1:0]       b_rerror_o, // Bit1: Uncorrectable, Bit0: Correctable
 
-  input        [CfgW-1:0]   cfg_i
+  input        [CfgW-1:0]  cfg_i
 );
 
-  // Calculate ECC width
-  localparam int ParWidth  = (EnableParity) ? 1 :
-                             (!EnableECC)   ? 0 :
-                             (Width <=   4) ? 4 :
-                             (Width <=  11) ? 5 :
-                             (Width <=  26) ? 6 :
-                             (Width <=  57) ? 7 :
-                             (Width <= 120) ? 8 : 8 ;
-  localparam int TotalWidth = Width + ParWidth;
+  prim_ram_2p_async_adv #(
+    .Depth               (Depth),
+    .Width               (Width),
+    .DataBitsPerMask     (DataBitsPerMask),
+    .CfgW                (CfgW),
+    .MemInitFile         (MemInitFile),
+    .EnableECC           (EnableECC),
+    .EnableParity        (EnableParity),
+    .EnableInputPipeline (EnableInputPipeline),
+    .EnableOutputPipeline(EnableOutputPipeline)
+  ) i_prim_ram_2p_async_adv (
+    .clk_a_i(clk_i),
+    .rst_a_ni(rst_ni),
+    .clk_b_i(clk_i),
+    .rst_b_ni(rst_ni),
+    .a_req_i,
+    .a_write_i,
+    .a_addr_i,
+    .a_wdata_i,
+    .a_wmask_i,
+    .a_rdata_o,
+    .a_rvalid_o,
+    .a_rerror_o,
+    .b_req_i,
+    .b_write_i,
+    .b_addr_i,
+    .b_wdata_i,
+    .b_wmask_i,
+    .b_rdata_o,
+    .b_rvalid_o,
+    .b_rerror_o,
+    .cfg_i
+  );
 
-  // We will eventually use cfg_i for RTC/WTC or other memory parameters.
-  logic [CfgW-1:0] unused_cfg;
-  assign unused_cfg = cfg_i;
-
-  logic                  a_req_q,    a_req_d ;
-  logic                  a_write_q,  a_write_d ;
-  logic [Aw-1:0]         a_addr_q,   a_addr_d ;
-  logic [TotalWidth-1:0] a_wdata_q,  a_wdata_d ;
-  logic                  a_rvalid_q, a_rvalid_d, a_rvalid_sram ;
-  logic [Width-1:0]      a_rdata_q,  a_rdata_d ;
-  logic [TotalWidth-1:0] a_rdata_sram ;
-  logic [1:0]            a_rerror_q, a_rerror_d ;
-
-  logic                  b_req_q,    b_req_d ;
-  logic                  b_write_q,  b_write_d ;
-  logic [Aw-1:0]         b_addr_q,   b_addr_d ;
-  logic [TotalWidth-1:0] b_wdata_q,  b_wdata_d ;
-  logic                  b_rvalid_q, b_rvalid_d, b_rvalid_sram ;
-  logic [Width-1:0]      b_rdata_q,  b_rdata_d ;
-  logic [TotalWidth-1:0] b_rdata_sram ;
-  logic [1:0]            b_rerror_q, b_rerror_d ;
-
-  if (MemT == "REGISTER") begin : gen_regmem
-    prim_ram_2p #(
-      // force register implementation for all targets
-      .Impl            (prim_pkg::ImplGeneric),
-
-      .Width           (TotalWidth),
-      .Depth           (Depth),
-      .DataBitsPerMask (TotalWidth),
-      .MemInitFile     (MemInitFile)
-    ) u_mem (
-      .clk_a_i    (clk_i),
-      .clk_b_i    (clk_i),
-
-      .a_req_i    (a_req_q),
-      .a_write_i  (a_write_q),
-      .a_addr_i   (a_addr_q),
-      .a_wdata_i  (a_wdata_q),
-      .a_wmask_i  ({TotalWidth{1'b1}}),
-      .a_rdata_o  (a_rdata_sram),
-
-      .b_req_i    (b_req_q),
-      .b_write_i  (b_write_q),
-      .b_addr_i   (b_addr_q),
-      .b_wdata_i  (b_wdata_q),
-      .b_wmask_i  ({TotalWidth{1'b1}}),
-      .b_rdata_o  (b_rdata_sram)
-    );
-  // end else if (TotalWidth == aa && Depth == yy) begin
-  end else if (MemT == "SRAM") begin : gen_srammem
-    prim_ram_2p #(
-      .Width           (TotalWidth),
-      .Depth           (Depth),
-      .DataBitsPerMask (TotalWidth),
-      .MemInitFile     (MemInitFile)
-    ) u_mem (
-      .clk_a_i    (clk_i),
-      .clk_b_i    (clk_i),
-
-      .a_req_i    (a_req_q),
-      .a_write_i  (a_write_q),
-      .a_addr_i   (a_addr_q),
-      .a_wdata_i  (a_wdata_q),
-      .a_wmask_i  ({TotalWidth{1'b1}}),
-      .a_rdata_o  (a_rdata_sram),
-
-      .b_req_i    (b_req_q),
-      .b_write_i  (b_write_q),
-      .b_addr_i   (b_addr_q),
-      .b_wdata_i  (b_wdata_q),
-      .b_wmask_i  ({TotalWidth{1'b1}}),
-      .b_rdata_o  (b_rdata_sram)
-    );
-  end
-
- always_ff @(posedge clk_i, negedge rst_ni) begin
-    if (!rst_ni) begin
-       a_rvalid_sram <= '0;
-       b_rvalid_sram <= '0;
-    end else begin
-      a_rvalid_sram <= a_req_q & ~a_write_q;
-      b_rvalid_sram <= b_req_q & ~b_write_q;
-    end
-  end
-
-  assign a_req_d              = a_req_i;
-  assign a_write_d            = a_write_i;
-  assign a_addr_d             = a_addr_i;
-  assign a_rvalid_o           = a_rvalid_q;
-  assign a_rdata_o            = a_rdata_q;
-  assign a_rerror_o           = a_rerror_q;
-
-  assign b_req_d              = b_req_i;
-  assign b_write_d            = b_write_i;
-  assign b_addr_d             = b_addr_i;
-  assign b_rvalid_o           = b_rvalid_q;
-  assign b_rdata_o            = b_rdata_q;
-  assign b_rerror_o           = b_rerror_q;
-
-  // TODO: Parity Logic
-  `ASSERT_INIT(ParityNotYetSupported_A, EnableParity == 0)
-
-  if (EnableParity == 0 && EnableECC) begin : gen_secded
-
-    // check supported widths
-    `ASSERT_INIT(SecDecWidth_A, Width inside {32})
-
-    if (Width == 32) begin : gen_secded_39_32
-      prim_secded_39_32_enc u_enc_a (.in(a_wdata_i), .out(a_wdata_d));
-      prim_secded_39_32_dec u_dec_a (
-        .in         (a_rdata_sram),
-        .d_o        (a_rdata_d),
-        .syndrome_o (),
-        .err_o      (a_rerror_d)
-      );
-      prim_secded_39_32_enc u_enc_b (.in(b_wdata_i), .out(b_wdata_d));
-      prim_secded_39_32_dec u_dec_b (
-        .in         (b_rdata_sram),
-        .d_o        (b_rdata_d),
-        .syndrome_o (),
-        .err_o      (b_rerror_d)
-      );
-      assign a_rvalid_d = a_rvalid_sram;
-      assign b_rvalid_d = b_rvalid_sram;
-    end
-  end else begin : gen_nosecded
-    assign a_wdata_d[0+:Width] = a_wdata_i;
-    assign b_wdata_d[0+:Width] = b_wdata_i;
-    assign a_rdata_d  = a_rdata_sram;
-    assign b_rdata_d  = b_rdata_sram;
-    assign a_rvalid_d = a_rvalid_sram;
-    assign b_rvalid_d = b_rvalid_sram;
-    assign a_rerror_d = 2'b00;
-    assign b_rerror_d = 2'b00;
-  end
-
-  if (EnableInputPipeline) begin : gen_regslice_input
-    // Put the register slices between ECC encoding to SRAM port
-    always_ff @(posedge clk_i or negedge rst_ni) begin
-      if (!rst_ni) begin
-        a_req_q   <= '0;
-        a_write_q <= '0;
-        a_addr_q  <= '0;
-        a_wdata_q <= '0;
-
-        b_req_q   <= '0;
-        b_write_q <= '0;
-        b_addr_q  <= '0;
-        b_wdata_q <= '0;
-      end else begin
-        a_req_q   <= a_req_d;
-        a_write_q <= a_write_d;
-        a_addr_q  <= a_addr_d;
-        a_wdata_q <= a_wdata_d;
-
-        b_req_q   <= b_req_d;
-        b_write_q <= b_write_d;
-        b_addr_q  <= b_addr_d;
-        b_wdata_q <= b_wdata_d;
-      end
-    end
-  end else begin : gen_dirconnect_input
-    assign a_req_q   = a_req_d;
-    assign a_write_q = a_write_d;
-    assign a_addr_q  = a_addr_d;
-    assign a_wdata_q = a_wdata_d;
-
-    assign b_req_q   = b_req_d;
-    assign b_write_q = b_write_d;
-    assign b_addr_q  = b_addr_d;
-    assign b_wdata_q = b_wdata_d;
-  end
-
-  if (EnableOutputPipeline) begin : gen_regslice_output
-    // Put the register slices between ECC decoding to output
-    always_ff @(posedge clk_i or negedge rst_ni) begin
-      if (!rst_ni) begin
-        a_rvalid_q <= '0;
-        a_rdata_q  <= '0;
-        a_rerror_q <= '0;
-
-        b_rvalid_q <= '0;
-        b_rdata_q  <= '0;
-        b_rerror_q <= '0;
-      end else begin
-        a_rvalid_q <= a_rvalid_d;
-        a_rdata_q  <= a_rdata_d ;
-        a_rerror_q <= a_rerror_d;
-
-        b_rvalid_q <= b_rvalid_d;
-        b_rdata_q  <= b_rdata_d ;
-        b_rerror_q <= b_rerror_d;
-      end
-    end
-  end else begin : gen_dirconnect_output
-    assign a_rvalid_q = a_rvalid_d;
-    assign a_rdata_q  = a_rdata_d;
-    assign a_rerror_q = a_rerror_d;
-
-    assign b_rvalid_q = b_rvalid_d;
-    assign b_rdata_q  = b_rdata_d;
-    assign b_rerror_q = b_rerror_d;
-  end
-
-endmodule
+endmodule : prim_ram_2p_adv
diff --git a/hw/ip/prim/rtl/prim_ram_2p_async_adv.sv b/hw/ip/prim/rtl/prim_ram_2p_async_adv.sv
index a1c8474..ab0cd08 100644
--- a/hw/ip/prim/rtl/prim_ram_2p_async_adv.sv
+++ b/hw/ip/prim/rtl/prim_ram_2p_async_adv.sv
@@ -2,63 +2,71 @@
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 //
-// Dual-port SRAM Wrapper
-//   This module to connect SRAM interface to actual SRAM interface
-//   At this time, it doesn't utilize ECC or any pipeline.
-//   This module stays to include any additional calculation logic later on.
-//   Instantiating SRAM is up to the top design to remove process dependency.
-
-// Parameter
-//   EnableECC:
-//   EnableParity:
-//   EnableInputPipeline:
-//   EnableOutputPipeline:
+// Asynchronous Dual-Port SRAM Wrapper
+//
+// Supported configurations:
+// - ECC for 32b wide memories with no write mask
+//   (Width == 32 && DataBitsPerMask == 32).
+// - Byte parity if Width is a multiple of 8 bit and write masks have Byte
+//   granularity (DataBitsPerMask == 8).
+//
+// Note that the write mask needs to be per Byte if parity is enabled. If ECC is enabled, the write
+// mask cannot be used and has to be tied to {Width{1'b1}}.
 
 `include "prim_assert.sv"
+`include "prim_util.svh"
 
 module prim_ram_2p_async_adv #(
-  parameter int Depth = 512,
-  parameter int Width = 32,
-  parameter int CfgW = 8,     // WTC, RTC, etc
+  parameter  int Depth                = 512,
+  parameter  int Width                = 32,
+  parameter  int DataBitsPerMask      = 1,  // Number of data bits per bit of write mask
+  parameter  int CfgW                 = 8,  // WTC, RTC, etc
+  parameter      MemInitFile          = "", // VMEM file to initialize the memory with
 
   // Configurations
-  parameter bit EnableECC            = 0,
-  parameter bit EnableParity         = 0,
-  parameter bit EnableInputPipeline  = 0,
-  parameter bit EnableOutputPipeline = 0,
+  parameter  bit EnableECC            = 0, // Enables per-word ECC
+  parameter  bit EnableParity         = 0, // Enables per-Byte Parity
+  parameter  bit EnableInputPipeline  = 0, // Adds an input register (read latency +1)
+  parameter  bit EnableOutputPipeline = 0, // Adds an output register (read latency +1)
 
-  parameter MemT = "REGISTER", // can be "REGISTER" or "SRAM"
-
-  // Do not touch
-  parameter int SramAw = $clog2(Depth)
+  localparam int Aw                   = vbits(Depth)
 ) (
   input clk_a_i,
   input clk_b_i,
   input rst_a_ni,
   input rst_b_ni,
 
-  input                     a_req_i,
-  input                     a_write_i,
-  input        [SramAw-1:0] a_addr_i,
-  input        [Width-1:0]  a_wdata_i,
-  output logic              a_rvalid_o,
-  output logic [Width-1:0]  a_rdata_o,
-  output logic [1:0]        a_rerror_o,
+  input                    a_req_i,
+  input                    a_write_i,
+  input        [Aw-1:0]    a_addr_i,
+  input        [Width-1:0] a_wdata_i,
+  input        [Width-1:0] a_wmask_i,  // cannot be used with ECC, tie to 1 in that case
+  output logic [Width-1:0] a_rdata_o,
+  output logic             a_rvalid_o, // read response (a_rdata_o) is valid
+  output logic [1:0]       a_rerror_o, // Bit1: Uncorrectable, Bit0: Correctable
 
-  input                     b_req_i,
-  input                     b_write_i,
-  input        [SramAw-1:0] b_addr_i,
-  input        [Width-1:0]  b_wdata_i,
-  output logic              b_rvalid_o,
-  output logic [Width-1:0]  b_rdata_o,
-  output logic [1:0]        b_rerror_o, // Bit1: Uncorrectable, Bit0: Correctable
+  input                    b_req_i,
+  input                    b_write_i,
+  input        [Aw-1:0]    b_addr_i,
+  input        [Width-1:0] b_wdata_i,
+  input        [Width-1:0] b_wmask_i,  // cannot be used with ECC, tie to 1 in that case
+  output logic [Width-1:0] b_rdata_o,
+  output logic             b_rvalid_o, // read response (b_rdata_o) is valid
+  output logic [1:0]       b_rerror_o, // Bit1: Uncorrectable, Bit0: Correctable
 
   // config
   input [CfgW-1:0] cfg_i
 );
 
+  `ASSERT_INIT(CannotHaveEccAndParity_A, !(EnableParity && EnableECC))
+
+  // While we require DataBitsPerMask to be per Byte (8) at the interface in case Byte parity is
+  // enabled, we need to switch this to a per-bit mask locally such that we can individually enable
+  // the parity bits to be written alongside the data.
+  localparam int LocalDataBitsPerMask = (EnableParity) ? 1 : DataBitsPerMask;
+
   // Calculate ECC width
-  localparam int ParWidth  = (EnableParity) ? 1 :
+  localparam int ParWidth  = (EnableParity) ? Width/8 :
                              (!EnableECC)   ? 0 :
                              (Width <=   4) ? 4 :
                              (Width <=  11) ? 5 :
@@ -67,75 +75,54 @@
                              (Width <= 120) ? 8 : 8 ;
   localparam int TotalWidth = Width + ParWidth;
 
-  logic                  a_req_q,    a_req_d ;
-  logic                  a_write_q,  a_write_d ;
-  logic [SramAw-1:0]     a_addr_q,   a_addr_d ;
-  logic [TotalWidth-1:0] a_wdata_q,  a_wdata_d ;
-  logic                  a_rvalid_q, a_rvalid_d, a_rvalid_sram ;
-  logic [TotalWidth-1:0] a_rdata_d, a_rdata_sram ;
-  logic [Width-1:0]      a_rdata_q ;
-  logic [1:0]            a_rerror_q, a_rerror_d ;
+  ////////////////////////////
+  // RAM Primitive Instance //
+  ////////////////////////////
 
-  logic                  b_req_q,    b_req_d ;
-  logic                  b_write_q,  b_write_d ;
-  logic [SramAw-1:0]     b_addr_q,   b_addr_d ;
-  logic [TotalWidth-1:0] b_wdata_q,  b_wdata_d ;
-  logic                  b_rvalid_q, b_rvalid_d, b_rvalid_sram ;
-  logic [TotalWidth-1:0] b_rdata_d, b_rdata_sram ;
-  logic [Width-1:0]      b_rdata_q ;
-  logic [1:0]            b_rerror_q, b_rerror_d ;
+  logic                    a_req_q,    a_req_d ;
+  logic                    a_write_q,  a_write_d ;
+  logic [Aw-1:0]           a_addr_q,   a_addr_d ;
+  logic [TotalWidth-1:0]   a_wdata_q,  a_wdata_d ;
+  logic [TotalWidth-1:0]   a_wmask_q,  a_wmask_d ;
+  logic                    a_rvalid_q, a_rvalid_d, a_rvalid_sram ;
+  logic [Width-1:0]        a_rdata_q,  a_rdata_d ;
+  logic [TotalWidth-1:0]   a_rdata_sram ;
+  logic [1:0]              a_rerror_q, a_rerror_d ;
 
-  if (MemT == "REGISTER") begin : gen_regmem
-    prim_ram_2p #(
-      // force register implementation for all targets
-      .Impl            (prim_pkg::ImplGeneric),
+  logic                    b_req_q,    b_req_d ;
+  logic                    b_write_q,  b_write_d ;
+  logic [Aw-1:0]           b_addr_q,   b_addr_d ;
+  logic [TotalWidth-1:0]   b_wdata_q,  b_wdata_d ;
+  logic [TotalWidth-1:0]   b_wmask_q,  b_wmask_d ;
+  logic                    b_rvalid_q, b_rvalid_d, b_rvalid_sram ;
+  logic [Width-1:0]        b_rdata_q,  b_rdata_d ;
+  logic [TotalWidth-1:0]   b_rdata_sram ;
+  logic [1:0]              b_rerror_q, b_rerror_d ;
 
-      .Width           (TotalWidth),
-      .Depth           (Depth),
-      .DataBitsPerMask (TotalWidth)
-    ) u_mem (
-      .clk_a_i    (clk_a_i),
-      .clk_b_i    (clk_b_i),
+  prim_ram_2p #(
+    .MemInitFile     (MemInitFile),
 
-      .a_req_i    (a_req_q),
-      .a_write_i  (a_write_q),
-      .a_addr_i   (a_addr_q),
-      .a_wdata_i  (a_wdata_q),
-      .a_wmask_i  ({TotalWidth{1'b1}}),
-      .a_rdata_o  (a_rdata_sram),
+    .Width           (TotalWidth),
+    .Depth           (Depth),
+    .DataBitsPerMask (LocalDataBitsPerMask)
+  ) u_mem (
+    .clk_a_i    (clk_a_i),
+    .clk_b_i    (clk_b_i),
 
-      .b_req_i    (b_req_q),
-      .b_write_i  (b_write_q),
-      .b_addr_i   (b_addr_q),
-      .b_wdata_i  (b_wdata_q),
-      .b_wmask_i  ({TotalWidth{1'b1}}),
-      .b_rdata_o  (b_rdata_sram)
-    );
-  // end else if (TotalWidth == aa && Depth == yy) begin
-  end else if (MemT == "SRAM") begin : gen_srammem
-    prim_ram_2p #(
-      .Width           (TotalWidth),
-      .Depth           (Depth),
-      .DataBitsPerMask (TotalWidth)
-    ) u_mem (
-      .clk_a_i    (clk_a_i),
-      .clk_b_i    (clk_b_i),
+    .a_req_i    (a_req_q),
+    .a_write_i  (a_write_q),
+    .a_addr_i   (a_addr_q),
+    .a_wdata_i  (a_wdata_q),
+    .a_wmask_i  (a_wmask_q),
+    .a_rdata_o  (a_rdata_sram),
 
-      .a_req_i    (a_req_q),
-      .a_write_i  (a_write_q),
-      .a_addr_i   (a_addr_q),
-      .a_wdata_i  (a_wdata_q),
-      .a_wmask_i  ({TotalWidth{1'b1}}),
-      .a_rdata_o  (a_rdata_sram),
-
-      .b_req_i    (b_req_q),
-      .b_write_i  (b_write_q),
-      .b_addr_i   (b_addr_q),
-      .b_wdata_i  (b_wdata_q),
-      .b_wmask_i  ({TotalWidth{1'b1}}),
-      .b_rdata_o  (b_rdata_sram)
-    );
-  end
+    .b_req_i    (b_req_q),
+    .b_write_i  (b_write_q),
+    .b_addr_i   (b_addr_q),
+    .b_wdata_i  (b_wdata_q),
+    .b_wmask_i  (b_wmask_q),
+    .b_rdata_o  (b_rdata_sram)
+  );
 
   always_ff @(posedge clk_a_i or negedge rst_a_ni) begin
     if (!rst_a_ni) begin
@@ -166,43 +153,88 @@
   assign b_rdata_o            = b_rdata_q;
   assign b_rerror_o           = b_rerror_q;
 
-  // TODO: Parity Logic
-  `ASSERT_INIT(ParityNotYetSupported_A, EnableParity == 0)
+  /////////////////////////////
+  // ECC / Parity Generation //
+  /////////////////////////////
 
   if (EnableParity == 0 && EnableECC) begin : gen_secded
 
     // check supported widths
     `ASSERT_INIT(SecDecWidth_A, Width inside {32})
 
+    // the wmask is constantly set to 1 in this case
+    `ASSERT(OnlyWordWritePossibleWithEccPortA_A, a_req_i |->
+        a_wmask_i == {TotalWidth{1'b1}}, clk_a_i, rst_a_ni)
+    `ASSERT(OnlyWordWritePossibleWithEccPortB_A, b_req_i |->
+        b_wmask_i == {TotalWidth{1'b1}}, clk_b_i, rst_b_ni)
+
+    assign a_wmask_d = {TotalWidth{1'b1}};
+    assign b_wmask_d = {TotalWidth{1'b1}};
+
     if (Width == 32) begin : gen_secded_39_32
       prim_secded_39_32_enc u_enc_a (.in(a_wdata_i), .out(a_wdata_d));
       prim_secded_39_32_dec u_dec_a (
         .in         (a_rdata_sram),
         .d_o        (a_rdata_d[0+:Width]),
-        .syndrome_o (a_rdata_d[Width+:ParWidth]),
+        .syndrome_o ( ),
         .err_o      (a_rerror_d)
       );
       prim_secded_39_32_enc u_enc_b (.in(b_wdata_i), .out(b_wdata_d));
       prim_secded_39_32_dec u_dec_b (
         .in         (b_rdata_sram),
         .d_o        (b_rdata_d[0+:Width]),
-        .syndrome_o (b_rdata_d[Width+:ParWidth]),
+        .syndrome_o ( ),
         .err_o      (b_rerror_d)
       );
-      assign a_rvalid_d = a_rvalid_sram;
-      assign b_rvalid_d = b_rvalid_sram;
     end
-  end else begin : gen_nosecded
-    assign a_wdata_d[0+:Width] = a_wdata_i;
-    assign b_wdata_d[0+:Width] = b_wdata_i;
-    assign a_rdata_d[0+:Width] = a_rdata_sram;
-    assign b_rdata_d[0+:Width] = b_rdata_sram;
-    assign a_rvalid_d = a_rvalid_sram;
-    assign b_rvalid_d = b_rvalid_sram;
-    assign a_rerror_d = 2'b00;
-    assign b_rerror_d = 2'b00;
+  end else if (EnableParity) begin : gen_byte_parity
+
+    `ASSERT_INIT(ParityNeedsByteWriteMask_A, DataBitsPerMask == 8)
+    `ASSERT_INIT(WidthNeedsToBeByteAligned_A, Width % 8 == 0)
+
+    always_comb begin : p_parity
+      a_rerror_d = '0;
+      b_rerror_d = '0;
+      a_wmask_d[0+:Width] = a_wmask_i;
+      b_wmask_d[0+:Width] = b_wmask_i;
+      a_wdata_d[0+:Width] = a_wdata_i;
+      b_wdata_d[0+:Width] = b_wdata_i;
+
+      for (int i = 0; i < Width/8; i ++) begin
+        // parity generation (odd parity)
+        a_wdata_d[Width + i] = ~(^a_wdata_i[i*8 +: 8]);
+        b_wdata_d[Width + i] = ~(^b_wdata_i[i*8 +: 8]);
+        a_wmask_d[Width + i] = &a_wmask_i[i*8 +: 8];
+        b_wmask_d[Width + i] = &b_wmask_i[i*8 +: 8];
+        // parity decoding (errors are always uncorrectable)
+        a_rerror_d[1] |= ~(^{a_rdata_sram[i*8 +: 8], a_rdata_sram[Width + i]});
+        b_rerror_d[1] |= ~(^{b_rdata_sram[i*8 +: 8], b_rdata_sram[Width + i]});
+      end
+      // tie to zero if the read data is not valid
+      a_rerror_d &= {2{a_rvalid_sram}};
+      b_rerror_d &= {2{b_rvalid_sram}};
+    end
+
+    assign a_rdata_d  = a_rdata_sram[0+:Width];
+    assign b_rdata_d  = b_rdata_sram[0+:Width];
+  end else begin : gen_nosecded_noparity
+    assign a_wmask_d  = a_wmask_i;
+    assign b_wmask_d  = b_wmask_i;
+    assign a_wdata_d  = a_wdata_i;
+    assign b_wdata_d  = b_wdata_i;
+    assign a_rdata_d  = a_rdata_sram[0+:Width];
+    assign b_rdata_d  = b_rdata_sram[0+:Width];
+    assign a_rerror_d = '0;
+    assign b_rerror_d = '0;
   end
 
+  assign a_rvalid_d = a_rvalid_sram;
+  assign b_rvalid_d = b_rvalid_sram;
+
+  /////////////////////////////////////
+  // Input/Output Pipeline Registers //
+  /////////////////////////////////////
+
   if (EnableInputPipeline) begin : gen_regslice_input
     // Put the register slices between ECC encoding to SRAM port
     always_ff @(posedge clk_a_i or negedge rst_a_ni) begin
@@ -211,11 +243,13 @@
         a_write_q <= '0;
         a_addr_q  <= '0;
         a_wdata_q <= '0;
+        a_wmask_q <= '0;
       end else begin
         a_req_q   <= a_req_d;
         a_write_q <= a_write_d;
         a_addr_q  <= a_addr_d;
         a_wdata_q <= a_wdata_d;
+        a_wmask_q <= a_wmask_d;
       end
     end
     always_ff @(posedge clk_b_i or negedge rst_b_ni) begin
@@ -224,11 +258,13 @@
         b_write_q <= '0;
         b_addr_q  <= '0;
         b_wdata_q <= '0;
+        b_wmask_q <= '0;
       end else begin
         b_req_q   <= b_req_d;
         b_write_q <= b_write_d;
         b_addr_q  <= b_addr_d;
         b_wdata_q <= b_wdata_d;
+        b_wmask_q <= b_wmask_d;
       end
     end
   end else begin : gen_dirconnect_input
@@ -236,11 +272,13 @@
     assign a_write_q = a_write_d;
     assign a_addr_q  = a_addr_d;
     assign a_wdata_q = a_wdata_d;
+    assign a_wmask_q = a_wmask_d;
 
     assign b_req_q   = b_req_d;
     assign b_write_q = b_write_d;
     assign b_addr_q  = b_addr_d;
     assign b_wdata_q = b_wdata_d;
+    assign b_wmask_q = b_wmask_d;
   end
 
   if (EnableOutputPipeline) begin : gen_regslice_output
@@ -252,7 +290,7 @@
         a_rerror_q <= '0;
       end else begin
         a_rvalid_q <= a_rvalid_d;
-        a_rdata_q  <= a_rdata_d[0+:Width] ;
+        a_rdata_q  <= a_rdata_d;
         a_rerror_q <= a_rerror_d;
       end
     end
@@ -263,18 +301,18 @@
         b_rerror_q <= '0;
       end else begin
         b_rvalid_q <= b_rvalid_d;
-        b_rdata_q  <= b_rdata_d[0+:Width] ;
+        b_rdata_q  <= b_rdata_d;
         b_rerror_q <= b_rerror_d;
       end
     end
   end else begin : gen_dirconnect_output
     assign a_rvalid_q = a_rvalid_d;
-    assign a_rdata_q  = a_rdata_d[0+:Width];
+    assign a_rdata_q  = a_rdata_d;
     assign a_rerror_q = a_rerror_d;
 
     assign b_rvalid_q = b_rvalid_d;
-    assign b_rdata_q  = b_rdata_d[0+:Width];
+    assign b_rdata_q  = b_rdata_d;
     assign b_rerror_q = b_rerror_d;
   end
 
-endmodule
+endmodule : prim_ram_2p_async_adv