// Copyright lowRISC contributors.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
// This module is the single round keccak permutation module
// It supports Keccak with up to 1600b of state
`include ""
module keccak_2share
  import prim_mubi_pkg::*;
#(
  // Total state width in bits.
  parameter int Width = 1600, // b= {25, 50, 100, 200, 400, 800, 1600}

  // Derived
  localparam int W        = Width/25,              // Lane width (z dimension)
  localparam int L        = $clog2(W),
  localparam int MaxRound = 12 + 2*L,              // Keccak-f only
  localparam int RndW     = $clog2(MaxRound+1),    // Representing up to MaxRound

  // Control parameters
  parameter  bit EnMasking = 0,                    // Enable secure hardening
  localparam int Share     = EnMasking ? 2 : 1     // Number of state shares
) (
  input clk_i,
  input rst_ni,

  input lc_ctrl_pkg::lc_tx_t lc_escalate_en_i, // Used to disable SVAs when escalating.

  input [RndW-1:0]    rnd_i,       // Current round index
  input mubi4_t       phase_sel_i, // Output mux control. Used when EnMasking := 1
  input [1:0]         cycle_i,     // Current cycle index. Used when EnMasking := 1
  input               rand_aux_i,  // Auxiliary randomness input. Used when EnMasking := 1
  input [Width/2-1:0] rand_i,      // Randomness for remasking. Used when EnMasking := 1

  input        [Width-1:0] s_i [Share], // Input state, one flat bit array per share
  output logic [Width-1:0] s_o [Share]  // Output state, one flat bit array per share
);
// Types //
// Packed views of the Keccak state. The state is a 5 x 5 x W array of bits;
// the typedefs below name the full state and its sub-dimensions.
// x y z
typedef logic [4:0][4:0][W-1:0] box_t; // (x,y,z) state
typedef logic [W-1:0] lane_t; // (z)
typedef logic [4:0] [W-1:0] plane_t; // (x,z)
typedef logic [4:0][4:0] slice_t; // (x,y)
typedef logic [4:0][W-1:0] sheet_t; // (y,z) identical to plane_t
typedef logic [4:0] row_t; // (x)
typedef logic [4:0] col_t; // (y) identical to row_t
// Keccak_f //
// Per-share intermediate state, one entry per share (Share is 1 or 2).
// state_in   : s_i converted to box_t by bitarray_to_box().
// state_out  : converted back to s_o by box_to_bitarray().
// theta_data : theta() applied to phase1_in.
// rho_data   : theta_data with each lane rotated (see the g_rho generate block).
// pi_data    : pi() applied to rho_data; forwarded as phase1_out.
// chi_data   : result of the Chi step computed from phase2_in.
// iota_data  : chi_data with the round constant applied to share 0.
// phase1_*   : input/output of the Theta, Rho, Pi phase.
// phase2_*   : input/output of the Chi, Iota phase.
box_t state_in [Share];
box_t state_out [Share];
box_t theta_data [Share];
box_t rho_data [Share];
box_t pi_data [Share];
box_t chi_data [Share];
box_t iota_data [Share];
box_t phase1_in [Share];
box_t phase1_out [Share];
box_t phase2_in [Share];
box_t phase2_out [Share];
// Unused nets //
// Tie off input signals that aren't used in the unmasked implementation.
if (!EnMasking) begin : gen_tie_unused
  // Sink nets: each input that is only consumed by the masked implementation
  // is assigned to an otherwise unused signal so that it has a reader.
  logic               unused_clk;
  logic               unused_rst_n;
  mubi4_t             unused_phase_sel;
  logic [1:0]         unused_cycle;
  logic               unused_rand_aux;
  logic [Width/2-1:0] unused_rand;

  assign unused_clk       = clk_i;
  assign unused_rst_n     = rst_ni;
  assign unused_phase_sel = phase_sel_i;
  assign unused_cycle     = cycle_i;
  assign unused_rand_aux  = rand_aux_i;
  assign unused_rand      = rand_i;
end : gen_tie_unused
// Input/output type conversion and interfacing //
for (genvar sh = 0 ; sh < Share ; sh = sh + 1) begin : g_state_inout
  // Outbound: fold the 3D result of this share back into a flat bit array.
  assign s_o[sh]      = box_to_bitarray(state_out[sh]);
  // Inbound: unfold the flat bit array of this share into the 3D (x,y,z) view.
  assign state_in[sh] = bitarray_to_box(s_i[sh]);
end : g_state_inout
if (EnMasking) begin : g_2share_data
  // Masked: both phases read the externally stored state, and phase_sel_i
  // picks which phase result is driven onto the output.
  assign phase1_in = state_in;
  assign phase2_in = state_in;

  always_comb begin
    unique case (phase_sel_i)
      MuBi4False: state_out = phase1_out;
      MuBi4True:  state_out = phase2_out;
      // Any other (invalid) encoding falls back to the Phase 1 result.
      default:    state_out = phase1_out;
    endcase
  end
end else begin : g_single_data
  // Unmasked: Phase 1 feeds Phase 2 directly, so a full round is combinational.
  assign phase1_in = state_in;
  assign phase2_in = phase1_out;
  assign state_out = phase2_out;
end
// Datapath //
// This module has two phases. First phase, it calculates Theta, Rho, Pi steps
// in SHA3. At the second phase, it computes Chi and Iota steps. If masking is
// not enabled, the two phases are completed within a single clock cycle.
// If masking is enabled, the first phase (Phase1) completes in one cycle.
// Then, the output should be stored in the state and given to the input of
// this module again. The second phase in the masked version needs three
// clock cycles to complete. In the first clock cycle, the first stage of Chi
// is computed for the first lane halves. In the second clock cycle, the
// module then outputs the updated first lane halves. In the third clock
// cycle, the new second lane halves are output. To make SCA more difficult, we
// randomly decide which lane halves to process first on a per-round basis.
// We use additional randomness generated by the PRNG to take this decision
// (rand_aux_i). For more details, refer to the comments in the "MUX control"
// section below.
for (genvar sh = 0 ; sh < Share ; sh = sh + 1) begin : g_datapath
  // Phase 1: Theta -> Rho -> Pi, applied to every share independently.
  assign theta_data[sh] = theta(phase1_in[sh]);

  // Rho is not called as a function here because VCS rejects z-Offset%W as a
  // non-constant expression; rho_data is driven elsewhere in this module.
  // assign rho_data[sh] = rho(theta_data[sh]);

  assign pi_data[sh] = pi(rho_data[sh]);

  // Phase 2 (Cycles 1, 2 and 3): Chi and Iota are implemented below.
end : g_datapath

// The Pi result is the output of Phase 1.
assign phase1_out = pi_data;
// Iota adds Round Constants(RC), so only one share should be XORed
if (EnMasking) begin : g_2share_iota
  // Share 0 absorbs the round constant; share 1 passes through unchanged so
  // that the XOR of both shares sees the constant exactly once.
  assign iota_data[0] = iota(chi_data[0], rnd_i);
  assign iota_data[1] = chi_data[1];
end else begin : g_single_iota
  assign iota_data[0] = iota(chi_data[0], rnd_i);
end
if (EnMasking) begin : g_2share_chi
  // Domain-Oriented Masking
  // reference: (link not present in this copy of the file - TODO restore)

  // Number of bits in half a sheet: five half lanes, i.e. 5 * W/2.
  localparam int unsigned WSheetHalf = $bits(sheet_t)/2;

  logic [4:0][WSheetHalf-1:0] in_prd, out_prd;
  logic low_then_high_d, low_then_high_q;
  logic in_data_low, out_data_low;
  logic in_rand_ext;
  logic update_dom;

  // MUX control //

  // Update lane-half processing order in Phase 1 and keep the value constant
  // for the entire round.
  assign low_then_high_d =
      mubi4_test_false_strict(phase_sel_i) ? rand_aux_i : low_then_high_q;

  always_ff @(posedge clk_i or negedge rst_ni) begin
    if (!rst_ni) begin
      low_then_high_q <= 1'b 0;
    end else begin
      low_then_high_q <= low_then_high_d;
    end
  end

  // This implementation uses both randomness provided from an external PRNG
  // as well as intermediate results for remasking the DOM multipliers below.
  // Per clock cycle, 800b of pseudo-random data (PRD) are required. The
  // following schedule is used to only ever update the input data when also
  // providing fresh randomness and vice versa.
  //
  // Cycle 0: Compute Theta, Rho, Pi - The DOM multipliers are not evaluated
  //          at all: the inputs are driven by the first lane halves (same
  //          values as in Cycle 3). Also the intermediate results we already
  //          had in Cycle 3 didn't change.
  // Cycle 1: Compute first stage of Chi for first lane halves using the DOM
  //          multipliers. We use the fresh randomness provided from the
  //          PRNG for remasking.
  // Cycle 2: Compute second stage of Chi and Iota for first lane halves.
  //          Compute first stage of Chi for second lane halves. We use the
  //          fresh randomness provided from the PRNG for remasking the
  //          DOM multipliers.
  // Cycle 3: Compute second stage of Chi and Iota for second lane halves.
  //          Feed again first lane halves to DOM multiplier inputs (now
  //          the updated values become visible) together with intermediate
  //          results of Cycle 2. Don't update the register stage inside
  //          the DOM multipliers.
  always_comb begin
    unique case (cycle_i)
      2'h0: begin
        in_data_low = low_then_high_q;
        in_rand_ext = 1'b0;
        update_dom  = 1'b0;
      end
      2'h1: begin
        in_data_low = low_then_high_q;
        in_rand_ext = 1'b1;
        update_dom  = 1'b1;
      end
      2'h2: begin
        in_data_low = ~low_then_high_q;
        in_rand_ext = 1'b1;
        update_dom  = 1'b1;
      end
      2'h3: begin
        in_data_low = low_then_high_q;
        in_rand_ext = 1'b0;
        update_dom  = 1'b0;
      end
      default: begin
        in_data_low = low_then_high_q;
        in_rand_ext = 1'b0;
        update_dom  = 1'b0;
      end
    endcase
  end

  // When taking the lower lane halves in, the upper lane halves are output
  // and vice versa.
  assign out_data_low = ~in_data_low;

  // DOM multipliers //
  for (genvar x = 0 ; x < 5 ; x++) begin : g_chi_w
    localparam int X1 = (x + 1) % 5;
    localparam int X2 = (x + 2) % 5;
    sheet_t sheet0[Share]; // Inverted input X1
    sheet_t sheet1[Share]; // X2
    sheet_t sheet2[Share]; // DOM output

    // Only share 0 is inverted: inverting one share inverts the shared value.
    assign sheet0[0] = ~phase2_in[0][X1];
    assign sheet0[1] =  phase2_in[1][X1];
    assign sheet1[0] =  phase2_in[0][X2];
    assign sheet1[1] =  phase2_in[1][X2];

    // Convert sheet_t to 1D arrays, one for the upper and lower half lane.
    // Lane y = 0 occupies the most significant half lane of the vector and
    // y = 4 the least significant one, mirroring the q0/q1 unpacking below.
    logic [WSheetHalf-1:0] a0_l, a1_l, b0_l, b1_l;
    logic [WSheetHalf-1:0] a0_h, a1_h, b0_h, b1_h;
    logic [WSheetHalf-1:0] a0, a1, b0, b1, q0, q1;

    assign a0_l = {sheet0[0][0][W/2-1:0],
                   sheet0[0][1][W/2-1:0],
                   sheet0[0][2][W/2-1:0],
                   sheet0[0][3][W/2-1:0],
                   sheet0[0][4][W/2-1:0]};
    assign a1_l = {sheet0[1][0][W/2-1:0],
                   sheet0[1][1][W/2-1:0],
                   sheet0[1][2][W/2-1:0],
                   sheet0[1][3][W/2-1:0],
                   sheet0[1][4][W/2-1:0]};
    assign a0_h = {sheet0[0][0][W-1:W/2],
                   sheet0[0][1][W-1:W/2],
                   sheet0[0][2][W-1:W/2],
                   sheet0[0][3][W-1:W/2],
                   sheet0[0][4][W-1:W/2]};
    assign a1_h = {sheet0[1][0][W-1:W/2],
                   sheet0[1][1][W-1:W/2],
                   sheet0[1][2][W-1:W/2],
                   sheet0[1][3][W-1:W/2],
                   sheet0[1][4][W-1:W/2]};

    assign b0_l = {sheet1[0][0][W/2-1:0],
                   sheet1[0][1][W/2-1:0],
                   sheet1[0][2][W/2-1:0],
                   sheet1[0][3][W/2-1:0],
                   sheet1[0][4][W/2-1:0]};
    assign b1_l = {sheet1[1][0][W/2-1:0],
                   sheet1[1][1][W/2-1:0],
                   sheet1[1][2][W/2-1:0],
                   sheet1[1][3][W/2-1:0],
                   sheet1[1][4][W/2-1:0]};
    assign b0_h = {sheet1[0][0][W-1:W/2],
                   sheet1[0][1][W-1:W/2],
                   sheet1[0][2][W-1:W/2],
                   sheet1[0][3][W-1:W/2],
                   sheet1[0][4][W-1:W/2]};
    assign b1_h = {sheet1[1][0][W-1:W/2],
                   sheet1[1][1][W-1:W/2],
                   sheet1[1][2][W-1:W/2],
                   sheet1[1][3][W-1:W/2],
                   sheet1[1][4][W-1:W/2]};

    // Input muxing
    assign a0 = in_data_low ? a0_l : a0_h;
    assign a1 = in_data_low ? a1_l : a1_h;
    assign b0 = in_data_low ? b0_l : b0_h;
    assign b1 = in_data_low ? b1_l : b1_h;

    // Randomness muxing
    // Intermediate results are rotated across rows. The new Row x depends on
    // data from Rows x + 1 and x + 2. Hence we don't want to use intermediate
    // results from Rows x, x + 1, and x + 2 for remasking.
    assign in_prd[x] = in_rand_ext ? rand_i[x * WSheetHalf +: WSheetHalf] :
                                     out_prd[rot_int(x, 5)];

    prim_dom_and_2share #(
      .DW      (WSheetHalf), // a half sheet
      .Pipeline(1)           // Process the full sheet in 3 clock cycles. This reduces
                             // SCA leakage.
    ) u_dom (
      .clk_i,
      .rst_ni,

      .a0_i      (a0),
      .a1_i      (a1),
      .b0_i      (b0),
      .b1_i      (b1),
      .z_valid_i (update_dom),
      .z_i       (in_prd[x]),
      .q0_o      (q0),
      .q1_o      (q1),
      .prd_o     (out_prd[x])
    );

    // Output conversion from q0, q1 to sheet_t
    // For simplicity, we forward the generated lane half to both the upper
    // and lower lane halves at this point. The actual output muxing/selection
    // happens after the Iota step when generating phase2_out from iota_data
    // and state_in below.
    assign sheet2[0][4] = {2{q0[W/2*0+:W/2]}};
    assign sheet2[0][3] = {2{q0[W/2*1+:W/2]}};
    assign sheet2[0][2] = {2{q0[W/2*2+:W/2]}};
    assign sheet2[0][1] = {2{q0[W/2*3+:W/2]}};
    assign sheet2[0][0] = {2{q0[W/2*4+:W/2]}};

    assign sheet2[1][4] = {2{q1[W/2*0+:W/2]}};
    assign sheet2[1][3] = {2{q1[W/2*1+:W/2]}};
    assign sheet2[1][2] = {2{q1[W/2*2+:W/2]}};
    assign sheet2[1][1] = {2{q1[W/2*3+:W/2]}};
    assign sheet2[1][0] = {2{q1[W/2*4+:W/2]}};

    // Final XOR to generate the output
    assign chi_data[0][x] = sheet2[0] ^ phase2_in[0][x];
    assign chi_data[1][x] = sheet2[1] ^ phase2_in[1][x];
  end : g_chi_w

  // Since Chi and thus Iota are separately applied to the lower and upper half
  // lanes, we need to forward the input to the other half.
  for (genvar x = 0 ; x < 5 ; x++) begin : g_2share_phase2_out_row
    for (genvar y = 0 ; y < 5 ; y++) begin : g_2share_phase2_out_col
      assign phase2_out[0][x][y] = out_data_low ?
          { state_in[0][x][y][W-1:W/2], iota_data[0][x][y][W/2-1:0]} :
          {iota_data[0][x][y][W-1:W/2],  state_in[0][x][y][W/2-1:0]};
      assign phase2_out[1][x][y] = out_data_low ?
          { state_in[1][x][y][W-1:W/2], iota_data[1][x][y][W/2-1:0]} :
          {iota_data[1][x][y][W-1:W/2],  state_in[1][x][y][W/2-1:0]};
    end
  end

end else begin : g_single_chi
  // Unmasked: Chi is a plain combinational function of the single share.
  assign chi_data[0] = chi(phase2_in[0]);
  assign phase2_out  = iota_data;
end
// Rho ======================================================================
// As RhoOffset[x][y] is considered as variable int in VCS,
// it is replaced with generate statement.
// Revised to meet verilator lint. Now RhoOffset is 1-D array
//
// Entry 5*x + y holds the rotation offset of lane (x, y).
localparam int RhoOffset [25] = '{
  //y  0    1    2    3    4     x
       0,  36,   3, 105, 210, // 0:  0  1  2  3  4
       1, 300,  10,  45,  66, // 1:  5  6  7  8  9
     190,   6, 171,  15, 253, // 2: 10 11 12 13 14
      28,  55, 153,  21, 120, // 3: 15 16 17 18 19
      91, 276, 231, 136,  78  // 4: 20 21 22 23 24
};
for (genvar i = 0 ; i < Share ; i++) begin : g_rho
  box_t rho_in, rho_out;
  assign rho_in      = theta_data[i];
  assign rho_data[i] = rho_out;

  for (genvar x = 0 ; x < 5 ; x++) begin : gen_rho_x
    for (genvar y = 0 ; y < 5 ; y++) begin : gen_rho_y
      // Offset is the rotation amount reduced to the lane width; ShiftAmt is
      // the number of low input bits that move up to the top of the output.
      localparam int Offset   = RhoOffset[5*x+y]%W;
      localparam int ShiftAmt = W- Offset;
      if (Offset == 0) begin : gen_offset0
        // No rotation: a zero-width part-select would be illegal below.
        assign rho_out[x][y][W-1:0] = rho_in[x][y][W-1:0];
      end else begin : gen_others
        // out[z] = in[(z - Offset) % W]: the low ShiftAmt bits move up by
        // Offset and the top Offset bits wrap around to the bottom.
        assign rho_out[x][y][W-1:0] = {rho_in[x][y][0+:ShiftAmt],
                                       rho_in[x][y][ShiftAmt+:Offset]};
      end
    end
  end
end : g_rho
// Assertions //
// Elaboration-time parameter sanity checks (`ASSERT_INIT) followed by a
// protocol property on phase_sel_i for the masked configuration.
// NOTE(review): the next two lines read as the condition of a Width check
// (25 is accepted only when EnMasking == 0), but the macro invocation that
// should open them is not present in this copy - confirm against upstream.
EnMasking == 0 && Width inside {25, 50, 100, 200, 400, 800, 1600} ||
EnMasking == 1 && Width inside {50, 100, 200, 400, 800, 1600})
// Lane width must be a power of two between 1 and 64.
`ASSERT_INIT(ValidW_A, W inside {1, 2, 4, 8, 16, 32, 64})
// L is $clog2(W), so it must lie in 0..6.
`ASSERT_INIT(ValidL_A, L inside {0, 1, 2, 3, 4, 5, 6})
// The RC table used by iota() has 24 entries.
`ASSERT_INIT(ValidRound_A, MaxRound <= 24) // Keccak-f only
// Once phase_sel_i changes from MuBi4False to MuBi4True, it shall still be
// MuBi4True in the following cycle.
// lc_escalate_en_i is otherwise only referenced by the property below, so it is
// also assigned to a sink net here.
lc_ctrl_pkg::lc_tx_t unused_lc_sig;
assign unused_lc_sig = lc_escalate_en_i;
if (EnMasking) begin : gen_selperiod_chk
// NOTE(review): the property below lacks its opening macro invocation and the
// generate block has no closing `end` in this copy - confirm against upstream.
// The trailing arguments are the clock and a disable condition (reset active
// or lc_escalate_en_i not equal to lc_ctrl_pkg::Off).
($past(phase_sel_i) == MuBi4False) && (phase_sel_i == MuBi4True)
|=> phase_sel_i == MuBi4True, clk_i, !rst_ni || lc_escalate_en_i != lc_ctrl_pkg::Off)
// Functions //
// Convert bitarray to 3D box
// Please take a look at FIPS PUB 202
// > For all triples (x,y,z) such that 0<=x<5, 0<=y<5, and 0<=z<w,
// > A[x,y,z]=S[w(5y+x)+z]
//
// s_in    : flat state, Width bits.
// returns : the same bits arranged as box_t, indexed [x][y][z].
function automatic box_t bitarray_to_box(logic [Width-1:0] s_in);
  automatic box_t box;
  for (int y = 0 ; y < 5 ; y++) begin
    for (int x = 0 ; x < 5 ; x++) begin
      for (int z = 0 ; z < W ; z++) begin
        box[x][y][z] = s_in[W*(5*y+x) + z];
      end
    end
  end
  return box;
endfunction : bitarray_to_box
// Convert 3D cube to bitarray
// Inverse of bitarray_to_box(): S[w(5y+x)+z] = A[x,y,z].
//
// state   : state in box_t form, indexed [x][y][z].
// returns : the same bits as a flat Width-bit array.
function automatic logic [Width-1:0] box_to_bitarray(box_t state);
  automatic logic [Width-1:0] bitarray;
  for (int y = 0 ; y < 5 ; y++) begin
    for (int x = 0 ; x < 5 ; x++) begin
      for (int z = 0 ; z < W ; z++) begin
        bitarray[W*(5*y+x)+z] = state[x][y][z];
      end
    end
  end
  return bitarray;
endfunction : box_to_bitarray
// Rotate integer indices
// Returns the predecessor of index `in` with wrap-around at zero:
//   in == 0 -> num - 1
//   else    -> in - 1
// No wrapping is applied to values of `in` other than zero.
function automatic integer rot_int(integer in, integer num);
  integer out;
  if (in == 0) begin
    out = num - 1;
  end else begin
    out = in - 1;
  end
  return out;
endfunction : rot_int
// Step Mapping =============================================================
// theta
// XOR each bit in the state with the parity of two columns
// C[x,z] = A[x,0,z] ^ A[x,1,z] ^ A[x,2,z] ^ A[x,3,z] ^ A[x,4,z]
// D[x,z] = C[x-1,z] ^ C[x+1,z-1]
// theta = A[x,y,z] ^ D[x,z]
parameter int ThetaIndexX1 [5] = '{4, 0, 1, 2, 3}; // (x-1)%5
parameter int ThetaIndexX2 [5] = '{1, 2, 3, 4, 0}; // (x+1)%5
// state   : input state, indexed [x][y][z].
// returns : state with D[x,z] XORed into every lane of sheet x.
function automatic box_t theta(box_t state);
  plane_t c;
  plane_t d;
  box_t result;
  // Column parities.
  for (int x = 0 ; x < 5 ; x++) begin
    c[x] = state[x][0] ^ state[x][1] ^ state[x][2] ^ state[x][3] ^ state[x][4];
  end
  // Combine the parities of the two neighbouring columns.
  for (int x = 0 ; x < 5 ; x++) begin
    for (int z = 0 ; z < W ; z++) begin
      int index_z;
      index_z = (z == 0) ? W-1 : z-1; // (z-1)%W
      d[x][z] = c[ThetaIndexX1[x]][z] ^ c[ThetaIndexX2[x]][index_z];
    end
  end
  // Apply D to every lane.
  for (int x = 0 ; x < 5 ; x++) begin
    for (int y = 0 ; y < 5 ; y++) begin
      result[x][y] = state[x][y] ^ d[x];
    end
  end
  return result;
endfunction : theta
// rho
// Commented out entire rho function due to VCS elaboration error.
// (z-RhoOffset[x][y]%W) isn't considered as a constant in VCS.
// Even changing it to W-RhoOffset[x][y]%W and assign to ShiftAmt
// creates same error.
// Offset : Look at Table 2 in FIPS PUB 202
//localparam int RhoOffset [5][5] = '{
// //y 0 1 2 3 4 x
// '{ 0, 36, 3, 105, 210},// 0
// '{ 1, 300, 10, 45, 66},// 1
// '{ 190, 6, 171, 15, 253},// 2
// '{ 28, 55, 153, 21, 120},// 3
// '{ 91, 276, 231, 136, 78} // 4
// rotate bits of each lane by offset
// 1. rho[0,0,z] = A[0,0,z]
// 2. Offset swap
// a. (x,y) := (1,0)
// b. for t [0..23]
// i. rho[x,y,z] = A[x,y,z-(t+1)(t+2)/2]
// ii. (x,y) = (y, (2x+3y))
//function automatic box_t rho(box_t state);
// box_t result;
// for (int x = 0 ; x < 5 ; x++) begin
// for (int y = 0 ; y < 5 ; y++) begin
// for (int z = 0 ; z < W ; z++) begin
// automatic int index_z;
// index_z = (z-RhoOffset[x][y])%W;
// result[x][y][z] = state[x][y][(z-RhoOffset[x][y])%W];
// end
// end
// end
// return result;
//endfunction : rho
// pi
// rearrange the position of lanes
// pi[x,y,z] = state[(x+3y),x,z]
//
// PiRotate[x][y] holds the precomputed source index (x + 3y) % 5.
localparam int PiRotate [5][5] = '{
  //y  0  1  2  3  4     x
  '{   0, 3, 1, 4, 2},// 0
  '{   1, 4, 2, 0, 3},// 1
  '{   2, 0, 3, 1, 4},// 2
  '{   3, 1, 4, 2, 0},// 3
  '{   4, 2, 0, 3, 1} // 4
};
// state   : input state, indexed [x][y][z].
// returns : state with whole lanes moved; bits inside a lane keep their order.
function automatic box_t pi(box_t state);
  box_t result;
  for (int x = 0 ; x < 5 ; x++) begin
    for (int y = 0 ; y < 5 ; y++) begin
      result[x][y][W-1:0] = state[PiRotate[x][y]][x][W-1:0];
    end
  end
  return result;
endfunction : pi
// chi
// chi[x,y,z] = state[x,y,z] ^ ((state[x+1,y,z] ^ 1) & state[x+2,y,z])
parameter int ChiIndexX1 [5] = '{1, 2, 3, 4, 0}; // (x+1)%5
parameter int ChiIndexX2 [5] = '{2, 3, 4, 0, 1}; // (x+2)%5
// state   : input state, indexed [x][y][z].
// returns : state after the Chi step, computed one sheet (fixed x) at a time.
function automatic box_t chi(box_t state);
  box_t result;
  for (int x = 0 ; x < 5 ; x++) begin
    result[x] = state[x] ^ ((~state[ChiIndexX1[x]]) & state[ChiIndexX2[x]]);
  end
  return result;
endfunction : chi
// iota
// XOR (x,y) = (0,0) with Round Constant (RC)
// RC parameter: Precomputed by util/ Only up-to 0..L-1 is used
// RC = '0
// RC[2**j-1] = rc(j+7*rnd)
// rc(t) =
// 1. t%255 == 0 -> 1
// 2. R[0:7] = 'b10000000
// 3. for i = [1..t%255]
// a. R = 0 || R
// b. R[0] = R[0] ^ R[8]
// c. R[4] = R[4] ^ R[8]
// d. R[5] = R[5] ^ R[8]
// e. R[6] = R[6] ^ R[8]
// f. R = R[0:7]
// 4. return R[0]
// RC has L = [0..6]
// for lower L case, only chopping lower part of 64bit RC is sufficient.
// Round constants, one 64-bit entry per round index. Lane widths below 64 bits
// use only the low W bits of each entry.
localparam logic [63:0] RC [24] = '{
  64'h 0000_0000_0000_0001, // Round 0
  64'h 0000_0000_0000_8082, // Round 1
  64'h 8000_0000_0000_808A, // Round 2
  64'h 8000_0000_8000_8000, // Round 3
  64'h 0000_0000_0000_808B, // Round 4
  64'h 0000_0000_8000_0001, // Round 5
  64'h 8000_0000_8000_8081, // Round 6
  64'h 8000_0000_0000_8009, // Round 7
  64'h 0000_0000_0000_008A, // Round 8
  64'h 0000_0000_0000_0088, // Round 9
  64'h 0000_0000_8000_8009, // Round 10
  64'h 0000_0000_8000_000A, // Round 11
  64'h 0000_0000_8000_808B, // Round 12
  64'h 8000_0000_0000_008B, // Round 13
  64'h 8000_0000_0000_8089, // Round 14
  64'h 8000_0000_0000_8003, // Round 15
  64'h 8000_0000_0000_8002, // Round 16
  64'h 8000_0000_0000_0080, // Round 17
  64'h 0000_0000_0000_800A, // Round 18
  64'h 8000_0000_8000_000A, // Round 19
  64'h 8000_0000_8000_8081, // Round 20
  64'h 8000_0000_0000_8080, // Round 21
  64'h 0000_0000_8000_0001, // Round 22
  64'h 8000_0000_8000_8008  // Round 23
};
// iota: apply the round constant to lane (x,y) = (0,0)
// state   : input state, indexed [x][y][z].
// rnd     : round index selecting the RC entry.
// returns : state with the low W bits of RC[rnd] XORed into lane (0,0);
//           all other lanes are copied unchanged.
function automatic box_t iota(box_t state, logic [RndW-1:0] rnd);
  logic [W-1:0] round_const;
  box_t         xored;

  // Chop the 64-bit constant down to the lane width.
  round_const = RC[rnd][W-1:0];

  xored               = state;
  xored[0][0][W-1:0]  = round_const ^ state[0][0][W-1:0];
  return xored;
endfunction : iota
// Round function : Rnd(A,i_r)
// Not used due to rho function issue described above.
//function automatic box_t keccak_rnd(box_t state, logic [RndW-1:0] rnd);
// box_t keccak_state;
// keccak_state = iota(chi(pi(rho(theta(state)))), rnd);
// return keccak_state;
//endfunction : keccak_rnd