blob: 8bfef62809046efafe46a884167c8649de691417 [file] [log] [blame]
// Copyright lowRISC contributors.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
//
// This is a draft implementation of a low-latency memory scrambling mechanism.
//
// The module is implemented as a primitive, in the same spirit as similar prim_ram_1p_adv wrappers.
// Hence, it can be conveniently instantiated by comportable IPs (such as OTBN) or in top_earlgrey
// for the main system memory.
//
// The currently implemented architecture uses a reduced-round PRINCE cipher primitive in CTR mode
// in order to (weakly) scramble the data written to the memory macro. Plain CTR mode does not
// diffuse the data since the keystream is just XOR'ed onto it, hence we also we perform byte-wise
// diffusion using a (shallow) substitution/permutation network layers in order to provide a limited
// avalanche effect within a byte.
//
// In order to break the linear addressing space, the address is passed through a bijective
// scrambling function constructed using a (shallow) substitution/permutation and a nonce. Due to
// that nonce, the address mapping is not fully baked into RTL and can be changed at runtime as
// well.
//
// See also: prim_cipher_pkg, prim_prince
`include "prim_assert.sv"
module prim_ram_1p_scr #(
parameter int Depth = 512, // Needs to be a power of 2 if NumAddrScrRounds > 0.
parameter int Width = 256, // Needs to be byte aligned for parity
parameter int DataBitsPerMask = 8, // Currently only 8 is supported
parameter int CfgWidth = 8, // WTC, RTC, etc
// Scrambling parameters. Note that this needs to be low-latency, hence we have to keep the
// amount of cipher rounds low. PRINCE has 5 half rounds in its original form, which corresponds
// to 2*5 + 1 effective rounds. Setting this to 2 halves this to approximately 5 effective rounds.
parameter int NumPrinceRoundsHalf = 2, // Number of PRINCE half rounds, can be [1..5]
// Number of extra intra-byte diffusion rounds. Setting this to 0 disables intra-byte diffusion.
parameter int NumByteScrRounds = 2,
// Number of address scrambling rounds. Setting this to 0 disables address scrambling.
parameter int NumAddrScrRounds = 2,
// If set to 1, the same 64bit key stream is replicated if the data port is wider than 64bit.
// If set to 0, the cipher primitive is replicated, and together with a wider nonce input,
// a unique keystream is generated for the full data width.
parameter bit ReplicateKeyStream = 1'b0,
// Derived parameters
localparam int AddrWidth = prim_util_pkg::vbits(Depth),
// Depending on the data width, we need to instantiate multiple parallel cipher primitives to
// create a keystream that is wide enough (PRINCE has a block size of 64bit)
localparam int NumParScr = (ReplicateKeyStream) ? 1 : (Width + 63) / 64,
localparam int NumParKeystr = (ReplicateKeyStream) ? (Width + 63) / 64 : 1,
// This is given by the PRINCE cipher primitive. All parallel cipher modules
// use the same key, but they use a different IV
localparam int DataKeyWidth = 128,
// Each 64 bit scrambling primitive requires a 64bit IV
localparam int NonceWidth = 64 * NumParScr
) (
input clk_i,
input rst_ni,
// Key interface. Memory requests will not be granted if key_valid is set to 0.
input key_valid_i,
input [DataKeyWidth-1:0] key_i,
input [NonceWidth-1:0] nonce_i,
// Interface to TL-UL SRAM adapter
input req_i,
output logic gnt_o,
input write_i,
input [AddrWidth-1:0] addr_i,
input [Width-1:0] wdata_i,
input [Width-1:0] wmask_i, // Needs to be byte-aligned for parity
output logic [Width-1:0] rdata_o,
output logic rvalid_o, // Read response (rdata_o) is valid
output logic [1:0] rerror_o, // Bit1: Uncorrectable, Bit0: Correctable
output logic [31:0] raddr_o, // Read address for error reporting.
// config
input [CfgWidth-1:0] cfg_i
);
//////////////////////
// Parameter Checks //
//////////////////////
// The depth needs to be a power of 2 in case address scrambling is turned on
`ASSERT_INIT(DepthPow2Check_A, NumAddrScrRounds <= '0 || 2**$clog2(Depth) == Depth)
/////////////////////////////////////////
// Pending Write and Address Registers //
/////////////////////////////////////////
// Writes are delayed by one cycle, such the same keystream generation primitive (prim_prince) can
// be reused among reads and writes. Note however that with this arrangement, we have to introduce
// a mechanism to hold a pending write transaction in cases where that transaction is immediately
// followed by a read. The pending write transaction is written to memory as soon as there is no
// new read transaction incoming. The latter can be a special case if the incoming read goes to
// the same address as the pending write. To that end, we detect the address collision and return
// the data from the write holding register.
// Read / write strobes
logic read_en, write_en_d, write_en_q;
assign gnt_o = req_i & key_valid_i;
assign read_en = gnt_o & ~write_i;
assign write_en_d = gnt_o & write_i;
logic write_pending_q;
logic addr_collision_d, addr_collision_q;
logic [AddrWidth-1:0] waddr_q;
assign addr_collision_d = read_en & (write_en_q | write_pending_q) & (addr_i == waddr_q);
// Macro requests and write strobe
logic macro_req;
assign macro_req = read_en | write_en_q | write_pending_q;
// We are allowed to write a pending write transaction to the memory if there is no incoming read
logic macro_write;
assign macro_write = (write_en_q | write_pending_q) & ~read_en;
// New read write collision
logic rw_collision;
assign rw_collision = write_en_q & read_en;
////////////////////////
// Address Scrambling //
////////////////////////
// We only select the pending write address in case there is no incoming read transaction.
logic [AddrWidth-1:0] addr_mux;
assign addr_mux = (read_en) ? addr_i : waddr_q;
// This creates a bijective address mapping using a substitution / permutation network.
logic [AddrWidth-1:0] addr_scr;
if (NumAddrScrRounds > 0) begin : gen_addr_scr
prim_subst_perm #(
.DataWidth ( AddrWidth ),
.NumRounds ( NumAddrScrRounds ),
.Decrypt ( 0 )
) u_prim_subst_perm (
.data_i ( addr_mux ),
// Since the counter mode concatenates {nonce_i[NonceWidth-1-AddrWidth:0], addr_i} to form
// the IV, the upper AddrWidth bits of the nonce are not used and can be used for address
// scrambling. In cases where N parallel PRINCE blocks are used due to a data
// width > 64bit, N*AddrWidth nonce bits are left dangling.
.key_i ( nonce_i[NonceWidth - 1 : NonceWidth - AddrWidth] ),
.data_o ( addr_scr )
);
end else begin : gen_no_addr_scr
assign addr_scr = addr_mux;
end
// We latch the non-scrambled address for error reporting.
logic [AddrWidth-1:0] raddr_q;
assign raddr_o = 32'(raddr_q);
//////////////////////////////////////////////
// Keystream Generation for Data Scrambling //
//////////////////////////////////////////////
// This encrypts the IV consisting of the nonce and address using the key provided in order to
// generate the keystream for the data. Note that we instantiate a register halfway within this
// primitive to balance the delay between request and response side.
logic [NumParScr*64-1:0] keystream;
for (genvar k = 0; k < NumParScr; k++) begin : gen_par_scr
prim_prince #(
.DataWidth (64),
.KeyWidth (128),
.NumRoundsHalf (NumPrinceRoundsHalf),
.UseOldKeySched (1'b0),
.HalfwayDataReg (1'b1), // instantiate a register halfway in the primitive
.HalfwayKeyReg (1'b0) // no need to instantiate a key register as the key remains static
) u_prim_prince (
.clk_i,
.rst_ni,
.valid_i ( gnt_o ),
// The IV is composed of a nonce and the row address
.data_i ( {nonce_i[k * (64 - AddrWidth) +: (64 - AddrWidth)], addr_i} ),
// All parallel scramblers use the same key
.key_i,
// Since we operate in counter mode, this can always be set to encryption mode
.dec_i ( 1'b0 ),
// Output keystream to be XOR'ed
.data_o ( keystream[k * 64 +: 64] ),
.valid_o ( )
);
// Unread unused bits from keystream
if (k == NumParKeystr-1 && (Width % 64) > 0) begin : gen_unread_last
localparam int UnusedWidth = 64 - (Width % 64);
logic [UnusedWidth-1:0] unused_keystream;
assign unused_keystream = keystream[(k+1) * 64 - 1 -: UnusedWidth];
end
end
// Replicate keystream if needed
logic [Width-1:0] keystream_repl;
assign keystream_repl = Width'({NumParKeystr{keystream}});
/////////////////////
// Data Scrambling //
/////////////////////
// Data scrambling is a two step process. First, we XOR the write data with the keystream obtained
// by operating a reduced-round PRINCE cipher in CTR-mode. Then, we diffuse data within each byte
// in order to get a limited "avalanche" behavior in case parts of the bytes are flipped as a
// result of a malicious attempt to tamper with the data in memory. We perform the diffusion only
// within bytes in order to maintain the ability to write individual bytes. Note that the
// keystream XOR is performed first for the write path such that it can be performed last for the
// read path. This allows us to hide a part of the combinational delay of the PRINCE primitive
// behind the propagation delay of the SRAM macro and the per-byte diffusion step.
// Write path. Note that since this does not fan out into the interconnect, the write path is not
// as critical as the read path below in terms of timing.
logic [Width-1:0] wdata_scr_d, wdata_scr_q, wdata_q;
for (genvar k = 0; k < Width/8; k++) begin : gen_diffuse_wdata
// Apply the keystream first
logic [7:0] wdata_xor;
assign wdata_xor = wdata_q[k*8 +: 8] ^ keystream_repl[k*8 +: 8];
// byte aligned diffusion using a substitution / permutation network
prim_subst_perm #(
.DataWidth ( 8 ),
.NumRounds ( NumByteScrRounds ),
.Decrypt ( 0 )
) u_prim_subst_perm (
.data_i ( wdata_xor ),
.key_i ( '0 ),
.data_o ( wdata_scr_d[k*8 +: 8] )
);
end
// Read path. This is timing critical. The keystream XOR operation is performed last in order to
// hide the combinational delay of the PRINCE primitive behind the propagation delay of the
// SRAM and the byte diffusion.
logic [Width-1:0] rdata_scr, rdata;
for (genvar k = 0; k < Width/8; k++) begin : gen_undiffuse_rdata
// Reverse diffusion first
logic [7:0] rdata_xor;
prim_subst_perm #(
.DataWidth ( 8 ),
.NumRounds ( NumByteScrRounds ),
.Decrypt ( 1 )
) u_prim_subst_perm (
.data_i ( rdata_scr[k*8 +: 8] ),
.key_i ( '0 ),
.data_o ( rdata_xor )
);
// Apply Keystream, replicate it if needed
assign rdata[k*8 +: 8] = rdata_xor ^ keystream_repl[k*8 +: 8];
end
////////////////////////////////////////////////
// Scrambled data register and forwarding mux //
////////////////////////////////////////////////
// This is the scrambled data holding register for pending writes. This is needed in order to make
// back to back patterns of the form WR -> RD -> WR work:
//
// cycle: 0 | 1 | 2 | 3 |
// incoming op: WR0 | RD | WR1 | - |
// prince: - | WR0 | RD | WR1 |
// memory op: - | RD | WR0 | WR1 |
//
// The read transaction in cycle 1 interrupts the first write transaction which has already used
// the PRINCE primitive for scrambling. If this sequence is followed by another write back-to-back
// in cycle 2, we cannot use the PRINCE primitive a second time for the first write, and hence
// need an additional holding register that can buffer the scrambled data of the first write in
// cycle 1.
// Clear this if we can write the memory in this cycle. Set only if the current write cannot
// proceed due to an incoming read operation.
logic write_scr_pending_d;
assign write_scr_pending_d = (macro_write) ? 1'b0 :
(rw_collision) ? 1'b1 :
write_pending_q;
// Select the correct scrambled word to be written, based on whether the word in the scrambled
// data holding register is valid or not. Note that the write_scr_q register could in theory be
// combined with the wdata_q register. We don't do that here for timing reasons, since that would
// require another read data mux to inject the scrambled data into the read descrambling path.
logic [Width-1:0] wdata_scr;
assign wdata_scr = (write_pending_q) ? wdata_scr_q : wdata_scr_d;
// Output read valid strobe
logic rvalid_q;
assign rvalid_o = rvalid_q;
// In case of a collision, we forward the write data from the unscrambled holding register
assign rdata_o = (addr_collision_q) ? wdata_q : // forward pending (unscrambled) write data
(rvalid_q) ? rdata : // regular reads
'0; // tie to zero otherwise
///////////////
// Registers //
///////////////
logic [Width-1:0] wmask_q;
always_ff @(posedge clk_i or negedge rst_ni) begin : p_wdata_buf
if (!rst_ni) begin
write_pending_q <= 1'b0;
addr_collision_q <= 1'b0;
rvalid_q <= 1'b0;
write_en_q <= 1'b0;
raddr_q <= '0;
waddr_q <= '0;
wmask_q <= '0;
wdata_q <= '0;
wdata_scr_q <= '0;
end else begin
write_pending_q <= write_scr_pending_d;
addr_collision_q <= addr_collision_d;
rvalid_q <= read_en;
write_en_q <= write_en_d;
if (read_en) begin
raddr_q <= addr_i;
end
if (write_en_d) begin
waddr_q <= addr_i;
wmask_q <= wmask_i;
wdata_q <= wdata_i;
end
if (rw_collision) begin
wdata_scr_q <= wdata_scr_d;
end
end
end
//////////////////
// Memory Macro //
//////////////////
prim_ram_1p_adv #(
.Depth(Depth),
.Width(Width),
.DataBitsPerMask(DataBitsPerMask),
.CfgW(CfgWidth),
.EnableECC(1'b0),
.EnableParity(1'b1), // We are using byte parity
.EnableInputPipeline(1'b0),
.EnableOutputPipeline(1'b0)
) u_prim_ram_1p_adv (
.clk_i,
.rst_ni,
.req_i ( macro_req ),
.write_i ( macro_write ),
.addr_i ( addr_scr ),
.wdata_i ( wdata_scr ),
.wmask_i ( wmask_q ),
.rdata_o ( rdata_scr ),
.rvalid_o ( ),
.rerror_o,
.cfg_i
);
endmodule : prim_ram_1p_scr