|  | // Copyright lowRISC contributors. | 
|  | // Licensed under the Apache License, Version 2.0, see LICENSE for details. | 
|  | // SPDX-License-Identifier: Apache-2.0 | 
|  | // | 
|  | // This is a draft implementation of a low-latency memory scrambling mechanism. | 
|  | // | 
|  | // The module is implemented as a primitive, in the same spirit as similar prim_ram_1p_adv wrappers. | 
|  | // Hence, it can be conveniently instantiated by comportable IPs (such as OTBN) or in top_earlgrey | 
|  | // for the main system memory. | 
|  | // | 
|  | // The currently implemented architecture uses a reduced-round PRINCE cipher primitive in CTR mode | 
|  | // in order to (weakly) scramble the data written to the memory macro. Plain CTR mode does not | 
|  | // diffuse the data since the keystream is just XOR'ed onto it, hence we also perform byte-wise |
|  | // diffusion using (shallow) substitution/permutation network layers in order to provide a limited |
|  | // avalanche effect within a byte. | 
|  | // | 
|  | // In order to break the linear addressing space, the address is passed through a bijective | 
|  | // scrambling function constructed using a (shallow) substitution/permutation and a nonce. Due to | 
|  | // that nonce, the address mapping is not fully baked into RTL and can be changed at runtime as | 
|  | // well. | 
|  | // | 
|  | // See also: prim_cipher_pkg, prim_prince | 
|  |  | 
|  | `include "prim_assert.sv" | 
|  |  | 
|  | module prim_ram_1p_scr import prim_ram_1p_pkg::*; #( |
|  | parameter  int Depth               = 16*1024, // Needs to be a power of 2 if NumAddrScrRounds > 0. |
|  | parameter  int Width               = 32, // Needs to be byte aligned if byte parity is enabled. |
|  | parameter  int DataBitsPerMask     = 8, // Needs to be set to 8 in case of byte parity. |
|  | parameter  bit EnableParity        = 1, // Enable byte parity. |
|  |  |
|  | // Scrambling parameters. Note that this needs to be low-latency, hence we have to keep the |
|  | // number of cipher rounds low. PRINCE has 5 half rounds in its original form, which corresponds |
|  | // to 2*5 + 1 effective rounds. Setting this to 2 halves this to approximately 5 effective rounds. |
|  | // Number of PRINCE half rounds, can be [1..5] |
|  | parameter  int NumPrinceRoundsHalf = 2, |
|  | // Number of extra diffusion rounds. Setting this to 0 disables diffusion. |
|  | parameter  int NumDiffRounds       = 2, |
|  | // This parameter governs the block-width of additional diffusion layers. |
|  | // For intra-byte diffusion, set this parameter to 8. |
|  | parameter  int DiffWidth           = DataBitsPerMask, |
|  | // Number of address scrambling rounds. Setting this to 0 disables address scrambling. |
|  | parameter  int NumAddrScrRounds    = 2, |
|  | // If set to 1, the same 64bit key stream is replicated if the data port is wider than 64bit. |
|  | // If set to 0, the cipher primitive is replicated, and together with a wider nonce input, |
|  | // a unique keystream is generated for the full data width. |
|  | parameter  bit ReplicateKeyStream  = 1'b0, |
|  | // Derived parameters |
|  | localparam int AddrWidth           = prim_util_pkg::vbits(Depth), |
|  | // Depending on the data width, we need to instantiate multiple parallel cipher primitives to |
|  | // create a keystream that is wide enough (PRINCE has a block size of 64bit) |
|  | localparam int NumParScr           = (ReplicateKeyStream) ? 1 : (Width + 63) / 64, |
|  | localparam int NumParKeystr        = (ReplicateKeyStream) ? (Width + 63) / 64 : 1, |
|  | // This is given by the PRINCE cipher primitive. All parallel cipher modules |
|  | // use the same key, but they use a different IV |
|  | localparam int DataKeyWidth        = 128, |
|  | // Each 64 bit scrambling primitive requires a 64bit IV, hence the nonce is 64 bit per primitive |
|  | localparam int NonceWidth          = 64 * NumParScr |
|  | ) ( |
|  | input                             clk_i, |
|  | input                             rst_ni, |
|  |  |
|  | // Key interface. Memory requests are not granted while key_valid_i is 0. |
|  | input                             key_valid_i, |
|  | input        [DataKeyWidth-1:0]   key_i, |
|  | input        [NonceWidth-1:0]     nonce_i, |
|  |  |
|  | // Interface to TL-UL SRAM adapter |
|  | input                             req_i, |
|  | output logic                      gnt_o, |
|  | input                             write_i, |
|  | input        [AddrWidth-1:0]      addr_i, |
|  | input        [Width-1:0]          wdata_i, |
|  | input        [Width-1:0]          wmask_i,  // Needs to be byte-aligned for parity |
|  | // On integrity errors, the primitive suppresses any real transaction to the memory. |
|  | input                             intg_error_i, |
|  | output logic [Width-1:0]          rdata_o, |
|  | output logic                      rvalid_o, // Read response (rdata_o) is valid |
|  | output logic [1:0]                rerror_o, // Bit1: Uncorrectable, Bit0: Correctable |
|  | output logic [31:0]               raddr_o,  // Read address for error reporting. |
|  |  |
|  | // Memory macro configuration, passed through to the prim_ram_1p_adv instance |
|  | input ram_1p_cfg_t                cfg_i |
|  | ); |
|  |  | 
|  | ////////////////////// | 
|  | // Parameter Checks // | 
|  | ////////////////////// | 
|  |  | 
|  | // Elaboration-time sanity checks on the parameterization. |
|  | // With address scrambling enabled, the depth has to be a power of 2. |
|  | `ASSERT_INIT(DepthPow2Check_A, (NumAddrScrRounds <= '0) || (Depth == 2**$clog2(Depth))) |
|  | // The diffusion layers need a block width of at least 4 bit. |
|  | `ASSERT_INIT(DiffWidthMinimum_A, 4 <= DiffWidth) |
|  | // If byte parity is enabled, diffusion must operate on exactly 8 bit wide blocks. |
|  | `ASSERT_INIT(DiffWidthWithParity_A, !EnableParity || (DiffWidth == 8)) |
|  |  | 
|  | ///////////////////////////////////////// | 
|  | // Pending Write and Address Registers // | 
|  | ///////////////////////////////////////// | 
|  |  | 
|  | // Writes are delayed by one cycle, such that the same keystream generation primitive (prim_prince) can |
|  | // be reused among reads and writes. Note however that with this arrangement, we have to introduce | 
|  | // a mechanism to hold a pending write transaction in cases where that transaction is immediately | 
|  | // followed by a read. The pending write transaction is written to memory as soon as there is no | 
|  | // new read transaction incoming. The latter can be a special case if the incoming read goes to | 
|  | // the same address as the pending write. To that end, we detect the address collision and return | 
|  | // the data from the write holding register. | 
|  |  | 
|  | // Read / write strobes. A request is granted in the same cycle whenever a valid key is present. |
|  | logic read_en, write_en_d, write_en_q; |
|  | assign gnt_o = req_i & key_valid_i; |
|  |  |
|  | assign read_en = gnt_o & ~write_i; |
|  | assign write_en_d = gnt_o & write_i; |
|  |  |
|  | logic write_pending_q; |
|  | logic addr_collision_d, addr_collision_q; |
|  | logic [AddrWidth-1:0] addr_scr; |
|  | logic [AddrWidth-1:0] waddr_scr_q; |
|  | assign addr_collision_d = read_en & (write_en_q | write_pending_q) & (addr_scr == waddr_scr_q); |
|  |  |
|  | // Macro request and write strobe. |
|  | // The macro operation is silenced if an integrity error is flagged on the current request |
|  | logic intg_error_buf, intg_error_w_q; |
|  | prim_buf u_intg_error ( |
|  | .in_i(intg_error_i), |
|  | .out_o(intg_error_buf) |
|  | ); |
|  | logic macro_req; |
|  | assign macro_req   = ~intg_error_w_q & ~intg_error_buf & (read_en | write_en_q | write_pending_q); |
|  | // We are allowed to write a pending write transaction to the memory if there is no incoming read. |
|  | logic macro_write; |
|  | assign macro_write = (write_en_q | write_pending_q) & ~read_en & ~intg_error_w_q; |
|  | // New read/write collision: a read is accepted in the cycle the registered write is due. |
|  | logic rw_collision; |
|  | assign rw_collision = write_en_q & read_en; |
|  |  | 
|  | //////////////////////// | 
|  | // Address Scrambling // | 
|  | //////////////////////// | 
|  |  | 
|  | // Reads take priority: the pending write address is only selected if there is no incoming read. |
|  | logic [AddrWidth-1:0] addr_mux; |
|  | assign addr_mux = (read_en) ? addr_scr : waddr_scr_q; |
|  |  |
|  | // This creates a bijective address mapping using a substitution / permutation network. |
|  | if (NumAddrScrRounds > 0) begin : gen_addr_scr |
|  | logic [AddrWidth-1:0] addr_scr_nonce; |
|  | assign addr_scr_nonce = nonce_i[NonceWidth - AddrWidth +: AddrWidth]; |
|  |  |
|  | prim_subst_perm #( |
|  | .DataWidth ( AddrWidth        ), |
|  | .NumRounds ( NumAddrScrRounds ), |
|  | .Decrypt   ( 0                ) |
|  | ) u_prim_subst_perm ( |
|  | .data_i ( addr_i         ), |
|  | // Since the counter mode concatenates {nonce_i[NonceWidth-1-AddrWidth:0], addr_i} to form |
|  | // the IV, the upper AddrWidth bits of the nonce are not used and can be used for address |
|  | // scrambling. In cases where N parallel PRINCE blocks are used due to a data |
|  | // width > 64bit, N*AddrWidth nonce bits are left dangling. |
|  | .key_i  ( addr_scr_nonce ), |
|  | .data_o ( addr_scr       ) |
|  | ); |
|  | end else begin : gen_no_addr_scr |
|  | assign addr_scr = addr_i; |
|  | end |
|  |  |
|  | // We latch the non-scrambled read address for error reporting and resize it to 32 bit. |
|  | logic [AddrWidth-1:0] raddr_q; |
|  | assign raddr_o = 32'(raddr_q); |
|  |  | 
|  | ////////////////////////////////////////////// | 
|  | // Keystream Generation for Data Scrambling // | 
|  | ////////////////////////////////////////////// | 
|  |  | 
|  | // This encrypts the IV consisting of the nonce and address using the key provided in order to |
|  | // generate the keystream for the data. Note that we instantiate a register halfway within this |
|  | // primitive to balance the delay between request and response side. |
|  | localparam int DataNonceWidth = 64 - AddrWidth; |
|  | logic [NumParScr*64-1:0] keystream; |
|  | logic [NumParScr-1:0][DataNonceWidth-1:0] data_scr_nonce; |
|  | for (genvar k = 0; k < NumParScr; k++) begin : gen_par_scr |
|  |   // Each parallel cipher instance uses its own DataNonceWidth wide slice of the nonce. |
|  |   assign data_scr_nonce[k] = nonce_i[k * DataNonceWidth +: DataNonceWidth]; |
|  |  |
|  |   prim_prince #( |
|  |     .DataWidth      (64), |
|  |     .KeyWidth       (128), |
|  |     .NumRoundsHalf  (NumPrinceRoundsHalf), |
|  |     .UseOldKeySched (1'b0), |
|  |     .HalfwayDataReg (1'b1), // instantiate a register halfway in the primitive |
|  |     .HalfwayKeyReg  (1'b0)  // no need to instantiate a key register as the key remains static |
|  |   ) u_prim_prince ( |
|  |     .clk_i, |
|  |     .rst_ni, |
|  |     .valid_i ( gnt_o ), |
|  |     // The IV is composed of a nonce and the (unscrambled) row address |
|  |     .data_i  ( {data_scr_nonce[k], addr_i} ), |
|  |     // All parallel scramblers use the same key |
|  |     .key_i, |
|  |     // Since we operate in counter mode, this can always be set to encryption mode |
|  |     .dec_i   ( 1'b0 ), |
|  |     // Output keystream to be XOR'ed |
|  |     .data_o  ( keystream[k * 64 +: 64] ), |
|  |     .valid_o ( ) |
|  |   ); |
|  |  |
|  |   // Unread unused bits from keystream. Bits are only dropped by the Width'() truncation below if |
|  |   // the keystream is not replicated (NumParKeystr == 1), and they always sit at the top of the |
|  |   // last cipher slice. The slice index therefore has to be compared against NumParScr (the loop |
|  |   // bound) and not against NumParKeystr, otherwise the wrong bits are marked for Width > 64. |
|  |   if (k == NumParScr-1 && NumParKeystr == 1 && (Width % 64) > 0) begin : gen_unread_last |
|  |     localparam int UnusedWidth = 64 - (Width % 64); |
|  |     logic [UnusedWidth-1:0] unused_keystream; |
|  |     assign unused_keystream = keystream[(k+1) * 64 - 1 -: UnusedWidth]; |
|  |   end |
|  | end |
|  |  |
|  | // Replicate keystream if needed, and truncate it to the data width |
|  | logic [Width-1:0] keystream_repl; |
|  | assign keystream_repl = Width'({NumParKeystr{keystream}}); |
|  |  | 
|  | ///////////////////// | 
|  | // Data Scrambling // | 
|  | ///////////////////// | 
|  |  | 
|  | // Data scrambling is a two step process. First, we XOR the write data with the keystream obtained | 
|  | // by operating a reduced-round PRINCE cipher in CTR-mode. Then, we diffuse data within each byte | 
|  | // in order to get a limited "avalanche" behavior in case parts of the bytes are flipped as a | 
|  | // result of a malicious attempt to tamper with the data in memory. We perform the diffusion only | 
|  | // within bytes in order to maintain the ability to write individual bytes. Note that the | 
|  | // keystream XOR is performed first for the write path such that it can be performed last for the | 
|  | // read path. This allows us to hide a part of the combinational delay of the PRINCE primitive | 
|  | // behind the propagation delay of the SRAM macro and the per-byte diffusion step. | 
|  |  | 
|  | logic [Width-1:0] rdata_scr, rdata; |
|  | logic [Width-1:0] wdata_scr_d, wdata_scr_q, wdata_q; |
|  | for (genvar k = 0; k < (Width + DiffWidth - 1) / DiffWidth; k++) begin : gen_diffuse_data |
|  |   // Bit offset of this slice, and the number of data bits that remain from that offset onwards. |
|  |   localparam int SliceLsb   = k * DiffWidth; |
|  |   localparam int BitsLeft   = Width - SliceLsb; |
|  |   // The last slice is narrower than DiffWidth if the Width is not divisible by DiffWidth. |
|  |   localparam int LocalWidth = (BitsLeft < DiffWidth) ? BitsLeft : DiffWidth; |
|  |  |
|  |   // Keystream bits covering this slice. The same bits are used on the write and the read path. |
|  |   logic [LocalWidth-1:0] keystream_slice; |
|  |   assign keystream_slice = keystream_repl[SliceLsb +: LocalWidth]; |
|  |  |
|  |   // Write path: keystream XOR first, then diffusion. This path does not fan out into the |
|  |   // interconnect and is hence less timing critical than the read path. |
|  |   logic [LocalWidth-1:0] wdata_xor; |
|  |   assign wdata_xor = keystream_slice ^ wdata_q[SliceLsb +: LocalWidth]; |
|  |  |
|  |   prim_subst_perm #( |
|  |     .DataWidth ( LocalWidth    ), |
|  |     .NumRounds ( NumDiffRounds ), |
|  |     .Decrypt   ( 0             ) |
|  |   ) u_prim_subst_perm_enc ( |
|  |     .data_i ( wdata_xor                           ), |
|  |     .key_i  ( '0                                  ), |
|  |     .data_o ( wdata_scr_d[SliceLsb +: LocalWidth] ) |
|  |   ); |
|  |  |
|  |   // Read path (timing critical): the diffusion is reversed first and the keystream XOR comes |
|  |   // last, so that the combinational delay of the PRINCE primitive is hidden behind the SRAM |
|  |   // propagation delay and the reverse diffusion. |
|  |   logic [LocalWidth-1:0] rdata_xor; |
|  |   prim_subst_perm #( |
|  |     .DataWidth ( LocalWidth    ), |
|  |     .NumRounds ( NumDiffRounds ), |
|  |     .Decrypt   ( 1             ) |
|  |   ) u_prim_subst_perm_dec ( |
|  |     .data_i ( rdata_scr[SliceLsb +: LocalWidth] ), |
|  |     .key_i  ( '0                                ), |
|  |     .data_o ( rdata_xor                         ) |
|  |   ); |
|  |  |
|  |   assign rdata[SliceLsb +: LocalWidth] = keystream_slice ^ rdata_xor; |
|  | end |
|  |  | 
|  | //////////////////////////////////////////////// | 
|  | // Scrambled data register and forwarding mux // | 
|  | //////////////////////////////////////////////// | 
|  |  | 
|  | // This is the scrambled data holding register for pending writes. This is needed in order to make | 
|  | // back to back patterns of the form WR -> RD -> WR work: | 
|  | // | 
|  | // cycle:          0   |  1   | 2   | 3   | | 
|  | // incoming op:    WR0 |  RD  | WR1 | -   | | 
|  | // prince:         -   |  WR0 | RD  | WR1 | | 
|  | // memory op:      -   |  RD  | WR0 | WR1 | | 
|  | // | 
|  | // The read transaction in cycle 1 interrupts the first write transaction which has already used | 
|  | // the PRINCE primitive for scrambling. If this sequence is followed by another write back-to-back | 
|  | // in cycle 2, we cannot use the PRINCE primitive a second time for the first write, and hence | 
|  | // need an additional holding register that can buffer the scrambled data of the first write in | 
|  | // cycle 1. | 
|  |  | 
|  | // Next state of the pending-write flag. Issuing the write to the macro in this cycle clears it |
|  | // (highest priority). Otherwise it is set if the current write cannot proceed due to an incoming |
|  | // read operation, and it keeps its value in all remaining cases. |
|  | logic write_scr_pending_d; |
|  | assign write_scr_pending_d = ~macro_write & (rw_collision | write_pending_q); |
|  |  |
|  | // Scrambled word presented to the macro: the holding register wdata_scr_q if it contains a valid |
|  | // pending write, the freshly scrambled data otherwise. Note that the wdata_scr_q register could |
|  | // in theory be combined with the wdata_q register. We don't do that here for timing reasons, |
|  | // since that would require another read data mux to inject the scrambled data into the read |
|  | // descrambling path. |
|  | logic [Width-1:0] wdata_scr; |
|  | assign wdata_scr = write_pending_q ? wdata_scr_q : wdata_scr_d; |
|  |  | 
|  | logic rvalid_q; |
|  | logic intg_error_r_q; |
|  | logic [Width-1:0] wmask_q; |
|  | always_comb begin : p_forward_mux |
|  |   // By default no response is returned and the read data is zeroed. This is also what is |
|  |   // returned if the read response is killed because an integrity error was seen. |
|  |   rvalid_o = 1'b0; |
|  |   rdata_o  = '0; |
|  |   if (!intg_error_r_q && rvalid_q) begin |
|  |     rvalid_o = 1'b1; |
|  |     // Regular reads return the descrambled macro data. |
|  |     rdata_o  = rdata; |
|  |     // On an address collision, the bits covered by the write mask are overridden with the |
|  |     // corresponding bits of the unscrambled write data holding register. |
|  |     if (addr_collision_q) begin |
|  |       for (int b = 0; b < Width; b++) begin |
|  |         if (wmask_q[b]) begin |
|  |           rdata_o[b] = wdata_q[b]; |
|  |         end |
|  |       end |
|  |     end |
|  |   end |
|  | end |
|  |  | 
|  | /////////////// | 
|  | // Registers // | 
|  | /////////////// | 
|  |  | 
|  | always_ff @(posedge clk_i or negedge rst_ni) begin : p_wdata_buf |
|  |   if (!rst_ni) begin |
|  |     // Control and status flops |
|  |     write_en_q       <= 1'b0; |
|  |     write_pending_q  <= 1'b0; |
|  |     rvalid_q         <= 1'b0; |
|  |     addr_collision_q <= 1'b0; |
|  |     intg_error_r_q   <= 1'b0; |
|  |     intg_error_w_q   <= 1'b0; |
|  |     // Address and data holding registers |
|  |     raddr_q          <= '0; |
|  |     waddr_scr_q      <= '0; |
|  |     wmask_q          <= '0; |
|  |     wdata_q          <= '0; |
|  |     wdata_scr_q      <= '0; |
|  |   end else begin |
|  |     // These flops are updated in every cycle. |
|  |     write_en_q       <= write_en_d; |
|  |     write_pending_q  <= write_scr_pending_d; |
|  |     rvalid_q         <= read_en; |
|  |     addr_collision_q <= addr_collision_d; |
|  |     intg_error_r_q   <= intg_error_buf; |
|  |  |
|  |     // An accepted write captures its scrambled address, mask, plain data and integrity status. |
|  |     if (write_en_d) begin |
|  |       waddr_scr_q    <= addr_scr; |
|  |       wmask_q        <= wmask_i; |
|  |       wdata_q        <= wdata_i; |
|  |       intg_error_w_q <= intg_error_buf; |
|  |     end |
|  |  |
|  |     // An accepted read captures the non-scrambled address. |
|  |     if (read_en) begin |
|  |       raddr_q <= addr_i; |
|  |     end |
|  |  |
|  |     // If a read blocks the delayed write, keep the scrambled write data for later. |
|  |     if (rw_collision) begin |
|  |       wdata_scr_q <= wdata_scr_d; |
|  |     end |
|  |   end |
|  | end |
|  |  | 
|  | ////////////////// | 
|  | // Memory Macro // | 
|  | ////////////////// | 
|  |  | 
|  | // Memory macro wrapper. ECC and both pipeline stages are disabled; byte parity follows the |
|  | // EnableParity parameter of this module. |
|  | prim_ram_1p_adv #( |
|  |   .Depth                ( Depth           ), |
|  |   .Width                ( Width           ), |
|  |   .DataBitsPerMask      ( DataBitsPerMask ), |
|  |   .EnableParity         ( EnableParity    ), |
|  |   .EnableECC            ( 1'b0            ), |
|  |   .EnableInputPipeline  ( 1'b0            ), |
|  |   .EnableOutputPipeline ( 1'b0            ) |
|  | ) u_prim_ram_1p_adv ( |
|  |   .clk_i    ( clk_i       ), |
|  |   .rst_ni   ( rst_ni      ), |
|  |   .cfg_i    ( cfg_i       ), |
|  |   // Request side: scrambled address and scrambled write data |
|  |   .req_i    ( macro_req   ), |
|  |   .write_i  ( macro_write ), |
|  |   .addr_i   ( addr_mux    ), |
|  |   .wdata_i  ( wdata_scr   ), |
|  |   .wmask_i  ( wmask_q     ), |
|  |   // Response side: the read valid of the macro is left unconnected |
|  |   .rdata_o  ( rdata_scr   ), |
|  |   .rvalid_o (             ), |
|  |   .rerror_o ( rerror_o    ) |
|  | ); |
|  |  | 
|  | `include "prim_util_get_scramble_params.svh" | 
|  |  | 
|  | endmodule : prim_ram_1p_scr |