hw/ip/kmac/rtl/keccak_2share.sv - 3p/lowrisc/opentitan - Git at Google

 // Copyright lowRISC contributors.
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 //
 // This module is the single round keccak permutation module
 // It supports Keccak with up to 1600b of state

 `include "prim_assert.sv"

 // Single-round Keccak permutation datapath.
 //
 // The state is carried as `Share` unpacked entries of Width bits each: one
 // entry when EnMasking = 0, two entries when EnMasking = 1. The inputs marked
 // "Used when EnMasking := 1" are only consumed by the masked implementation.
 module keccak_2share
   import prim_mubi_pkg::*;
 #(
   parameter int Width = 1600, // b= {25, 50, 100, 200, 400, 800, 1600}

   // Derived
   localparam int W        = Width/25,           // Lane width in bits (z dimension)
   localparam int L        = $clog2(W),          // log2 of the lane width
   localparam int MaxRound = 12 + 2*L,           // Keccak-f only
   localparam int RndW     = $clog2(MaxRound+1), // Representing up to MaxRound

   // Control parameters
   parameter  bit EnMasking = 0,                // Enable secure hardening
   localparam int Share     = EnMasking ? 2 : 1 // Number of state shares
 ) (
   input clk_i,
   input rst_ni,

   input  lc_ctrl_pkg::lc_tx_t lc_escalate_en_i, // Used to disable SVAs when escalating.

   input         [RndW-1:0] rnd_i, // Current round index
   input mubi4_t            phase_sel_i, // Output mux control. Used when EnMasking := 1
   input              [1:0] cycle_i, // Current cycle index. Used when EnMasking := 1
   input                    rand_aux_i, // Auxiliary randomness input. Used when EnMasking := 1
   input      [Width/2-1:0] rand_i, // Randomness for remasking. Used when EnMasking := 1
   input        [Width-1:0] s_i      [Share], // State input, one flat bit array per share
   output logic [Width-1:0] s_o      [Share]  // State output, one flat bit array per share
 );
   ///////////
   // Types //
   ///////////
   //             x    y    z
   typedef logic [4:0][4:0][W-1:0] box_t;   // (x,y,z) state
   typedef logic           [W-1:0] lane_t;  // (z)
   typedef logic [4:0]     [W-1:0] plane_t; // (x,z)
   typedef logic [4:0][4:0]        slice_t; // (x,y)
   typedef logic      [4:0][W-1:0] sheet_t; // (y,z) identical to plane_t
   typedef logic [4:0]             row_t;   // (x)
   typedef logic      [4:0]        col_t;   // (y) identical to row_t

   //////////////
   // Keccak_f //
   //////////////
   box_t state_in   [Share];
   box_t state_out  [Share];
   box_t theta_data [Share];
   box_t rho_data   [Share];
   box_t pi_data    [Share];
   box_t chi_data   [Share];
   box_t iota_data  [Share];

   box_t phase1_in  [Share];
   box_t phase1_out [Share];
   box_t phase2_in  [Share];
   box_t phase2_out [Share];

   /////////////////
   // Unused nets //
   /////////////////
   // Tie off input signals that aren't used in the unmasked implementation.
   if (!EnMasking) begin : gen_tie_unused
     // Sink signals for the clock, reset and masking-only inputs. The unmasked
     // datapath does not consume any of them.
     logic               unused_clk, unused_rst_n, unused_rand_aux;
     logic         [1:0] unused_cycle;
     logic [Width/2-1:0] unused_rand;
     mubi4_t             unused_phase_sel;

     assign unused_rand      = rand_i;
     assign unused_rand_aux  = rand_aux_i;
     assign unused_cycle     = cycle_i;
     assign unused_phase_sel = phase_sel_i;
     assign unused_rst_n     = rst_ni;
     assign unused_clk       = clk_i;
   end

   //////////////////////////////////////////////////
   // Input/output type conversion and interfacing //
   //////////////////////////////////////////////////
   for (genvar sh = 0 ; sh < Share ; sh++) begin : g_state_inout
     // Per share: flatten the (x,y,z) result for the output port and unpack
     // the flat input port into (x,y,z) form.
     assign s_o[sh]      = box_to_bitarray(state_out[sh]);
     assign state_in[sh] = bitarray_to_box(s_i[sh]);
   end : g_state_inout

   if (EnMasking) begin : g_2share_data
     // Masked: both phases read the state presented on s_i. phase_sel_i picks
     // which phase result is driven onto s_o.
     assign phase2_in = state_in;
     assign phase1_in = state_in;

     always_comb begin
       // Only an exact MuBi4True selects the Phase 2 result. MuBi4False and
       // every other encoding yield the Phase 1 result.
       if (phase_sel_i == MuBi4True) begin
         state_out = phase2_out;
       end else begin
         state_out = phase1_out;
       end
     end
   end else begin : g_single_data
     // Unmasked: Phase 1 feeds Phase 2 directly and Phase 2 drives the output.
     assign state_out = phase2_out;
     assign phase2_in = phase1_out;
     assign phase1_in = state_in;
   end

   //////////////
   // Datapath //
   //////////////
   // This module has two phases. First phase, it calculates Theta, Rho, Pi steps
   // in SHA3. At the second phase, it computes Chi and Iota steps. If masking is
   // not enabled, the two phases are completed within a single clock cycle.
   //
   // If masking is enabled, the first phase (Phase1) completes in one cycle.
   // Then, the output should be stored in the state and given to the input of
   // this module again. The second phase in the masked version needs three
   // clock cycles to complete. In the first clock cycle, the first stage of Chi
   // is computed for the first lane halves. In the second clock cycle, the
   // module then outputs the updated first lane halves. In the third clock
   // cycle, the new second lane halves are output. To make side-channel
   // analysis (SCA) harder, we randomly decide which lane halves to process
   // first on a per-round basis.
   // We use additional randomness generated by the PRNG to take this decision
   // (rand_aux_i). For more details, refer to the comments in the "MUX control"
   // section below.

   for (genvar sh = 0 ; sh < Share ; sh++) begin : g_datapath
     // Phase 1: Theta -> Rho -> Pi.
     // Rho is not a function call: rho_data is driven by the g_rho generate
     // block, because VCS rejected z-Offset%W as a non-constant expression.
     assign pi_data[sh]    = pi(rho_data[sh]);
     assign theta_data[sh] = theta(phase1_in[sh]);

     // Phase 2 (Cycles 1, 2 and 3): Chi and Iota are implemented below.
   end : g_datapath

   // The Pi result is the output of Phase 1.
   assign phase1_out = pi_data;

   // Iota adds the Round Constant (RC). The constant is XORed into share 0
   // only; with two shares, share 1 passes through unchanged.
   assign iota_data[0] = iota(chi_data[0], rnd_i);
   if (EnMasking) begin : g_2share_iota
     assign iota_data[1] = chi_data[1];
   end

   if (EnMasking) begin : g_2share_chi
     // Domain-Oriented Masking
     // reference: https://eprint.iacr.org/2017/395.pdf

     localparam int unsigned WSheetHalf = $bits(sheet_t)/2;
     logic [4:0][WSheetHalf-1:0] in_prd, out_prd;

     logic low_then_high_d, low_then_high_q;
     logic in_data_low, out_data_low;
     logic in_rand_ext;
     logic update_dom;

     /////////////////
     // MUX control //
     /////////////////

     // Lane-half processing order: take a new value from rand_aux_i while
     // phase_sel_i tests strictly false (Phase 1), otherwise hold the
     // registered value so that the order stays constant for the whole round.
     assign low_then_high_d = mubi4_test_false_strict(phase_sel_i) ? rand_aux_i
                                                                  : low_then_high_q;

     always_ff @(posedge clk_i or negedge rst_ni) begin
       if (!rst_ni) low_then_high_q <= 1'b0;
       else         low_then_high_q <= low_then_high_d;
     end

     // This implementation uses both randomness provided from an external PRNG
     // as well as intermediate results for remasking the DOM multipliers below.
     // Per clock cycle, 800b of pseudo-random data (PRD) are required. The
     // following schedule is used to only ever update the input data when also
     // providing fresh randomness and vice versa.
     //
     // Cycle 0: Compute Theta, Rho, Pi - The DOM multipliers are not evaluated
     //          at all: the inputs are driven by the first lane halves (same
     //          values as in Cycle 3). Also the intermediate results we already
     //          had in Cycle 3 didn't change.
     // Cycle 1: Compute first stage of Chi for first lane halves using the DOM
     //          multipliers. We use the fresh randomness provided from the
     //          PRNG for remasking.
     // Cycle 2: Compute second stage of Chi and Iota for first lane halves.
     //          Compute first stage of Chi for second lane halves. We use the
     //          fresh randomness provided from the PRNG for remasking the
     //          DOM multipliers.
     // Cycle 3: Compute second stage of Chi and Iota for second lane halves.
     //          Feed again first lane halves to DOM multiplier inputs (now
     //          the updated values become visible) together with intermediate
     //          results of Cycle 2. Don't update the register stage inside
     //          the DOM multipliers.
     always_comb begin
       // Values for Cycles 0 and 3 (and any other cycle_i value): feed the
       // first lane halves, remask with intermediate results and do not
       // update the DOM multipliers.
       in_data_low = low_then_high_q;
       in_rand_ext = 1'b0;
       update_dom  = 1'b0;

       unique case (cycle_i)
         2'h1: begin
           // First lane halves, fresh external randomness.
           in_rand_ext = 1'b1;
           update_dom  = 1'b1;
         end
         2'h2: begin
           // Second lane halves, fresh external randomness.
           in_data_low = ~low_then_high_q;
           in_rand_ext = 1'b1;
           update_dom  = 1'b1;
         end
         default: ; // Keep the values assigned above.
       endcase
     end

     // When taking the lower lane halves in, the upper lane halves are output
     // and vice versa.
     assign out_data_low = ~in_data_low;

     /////////////////////
     // DOM multipliers //
     /////////////////////

     for (genvar x = 0 ; x < 5 ; x++) begin : g_chi_w
       // Chi for sheet x combines the sheets at (x+1)%5 and (x+2)%5.
       localparam int X1 = (x + 1) % 5;
       localparam int X2 = (x + 2) % 5;

       sheet_t sheet0[Share]; // Inverted input X1
       sheet_t sheet1[Share]; // X2
       sheet_t sheet2[Share]; // DOM output

       // Flattened half sheets: lower (_l) and upper (_h) lane halves of the
       // two multiplier operands, the muxed operands and the DOM results.
       logic [WSheetHalf-1:0] a0_l, a1_l, b0_l, b1_l;
       logic [WSheetHalf-1:0] a0_h, a1_h, b0_h, b1_h;
       logic [WSheetHalf-1:0] a0, a1, b0, b1, q0, q1;

       // The inversion of the X1 operand is applied to share 0 only.
       assign sheet0[0] = ~phase2_in[0][X1];
       assign sheet0[1] =  phase2_in[1][X1];
       assign sheet1[0] =  phase2_in[0][X2];
       assign sheet1[1] =  phase2_in[1][X2];

       // Convert between sheet_t and the flat half-sheet vectors, one lane at
       // a time. Lane y = 4 sits in the least significant W/2 bits of the flat
       // vectors and lane y = 0 in the most significant W/2 bits.
       for (genvar y = 0 ; y < 5 ; y++) begin : g_half_lane
         localparam int Lsb = (4 - y) * (W/2);

         // Lower lane halves
         assign a0_l[Lsb +: W/2] = sheet0[0][y][W/2-1:0];
         assign a1_l[Lsb +: W/2] = sheet0[1][y][W/2-1:0];
         assign b0_l[Lsb +: W/2] = sheet1[0][y][W/2-1:0];
         assign b1_l[Lsb +: W/2] = sheet1[1][y][W/2-1:0];

         // Upper lane halves
         assign a0_h[Lsb +: W/2] = sheet0[0][y][W-1:W/2];
         assign a1_h[Lsb +: W/2] = sheet0[1][y][W-1:W/2];
         assign b0_h[Lsb +: W/2] = sheet1[0][y][W-1:W/2];
         assign b1_h[Lsb +: W/2] = sheet1[1][y][W-1:W/2];

         // Output conversion from q0, q1 to sheet_t. The computed lane half is
         // copied into both halves of the lane here; the half that is really
         // used is picked when phase2_out is built from iota_data and
         // state_in.
         assign sheet2[0][y] = {2{q0[Lsb +: W/2]}};
         assign sheet2[1][y] = {2{q1[Lsb +: W/2]}};
       end : g_half_lane

       // Input muxing: select which lane half enters the DOM multiplier.
       assign a0 = in_data_low ? a0_l : a0_h;
       assign a1 = in_data_low ? a1_l : a1_h;
       assign b0 = in_data_low ? b0_l : b0_h;
       assign b1 = in_data_low ? b1_l : b1_h;

       // Randomness muxing
       // Either take a slice of the external randomness, or reuse the
       // intermediate result of another row. The new Row x depends on data
       // from Rows x + 1 and x + 2, so intermediate results from Rows x,
       // x + 1 and x + 2 are avoided and the row given by rot_int(x, 5) is
       // used instead.
       assign in_prd[x] = in_rand_ext ? rand_i[x * WSheetHalf +: WSheetHalf] :
                                        out_prd[rot_int(x, 5)];

       prim_dom_and_2share #(
         .DW       (WSheetHalf), // a half sheet
         .Pipeline (1)           // Process the full sheet in 3 clock cycles.
                                 // This reduces SCA leakage.
       ) u_dom (
         .clk_i,
         .rst_ni,

         .a0_i      (a0),
         .a1_i      (a1),
         .b0_i      (b0),
         .b1_i      (b1),
         .z_valid_i (update_dom),
         .z_i       (in_prd[x]),
         .q0_o      (q0),
         .q1_o      (q1),
         .prd_o     (out_prd[x])
       );

       // Final XOR to generate the output
       assign chi_data[0][x] = sheet2[0] ^ phase2_in[0][x];
       assign chi_data[1][x] = sheet2[1] ^ phase2_in[1][x];
     end : g_chi_w

     // Chi and Iota only produce one lane half at a time. The half selected by
     // out_data_low is taken from iota_data; the other half is forwarded
     // unchanged from state_in.
     for (genvar sh = 0 ; sh < Share ; sh++) begin : g_2share_phase2_out_share
       for (genvar x = 0 ; x < 5 ; x++) begin : g_2share_phase2_out_row
         for (genvar y = 0 ; y < 5 ; y++) begin : g_2share_phase2_out_col
           // Upper lane half
           assign phase2_out[sh][x][y][W-1:W/2] = out_data_low ?
                state_in[sh][x][y][W-1:W/2] :
               iota_data[sh][x][y][W-1:W/2];
           // Lower lane half
           assign phase2_out[sh][x][y][W/2-1:0] = out_data_low ?
               iota_data[sh][x][y][W/2-1:0] :
                state_in[sh][x][y][W/2-1:0];
         end
       end
     end

   end else begin : g_single_chi
     assign chi_data[0] = chi(phase2_in[0]);
     assign phase2_out = iota_data;
   end

   // Rho ======================================================================
   // Rotate every lane by its offset. The rotation is built from generate
   // statements because VCS treats RhoOffset[x][y] as a variable int. The
   // offset table is 1-D to satisfy Verilator lint.
   localparam int RhoOffset [25]  = '{
     //y  0    1    2    3    4     x
          0,  36,   3, 105, 210, // 0:  0  1  2  3  4
          1, 300,  10,  45,  66, // 1:  5  6  7  8  9
        190,   6, 171,  15, 253, // 2: 10 11 12 13 14
         28,  55, 153,  21, 120, // 3: 15 16 17 18 19
         91, 276, 231, 136,  78  // 4: 20 21 22 23 24
   };
   for (genvar sh = 0 ; sh < Share ; sh++) begin : g_rho
     box_t rho_in, rho_out;
     assign rho_in       = theta_data[sh];
     assign rho_data[sh] = rho_out;

     for (genvar x = 0 ; x < 5 ; x++) begin : gen_rho_x
       for (genvar y = 0 ; y < 5 ; y++) begin : gen_rho_y
         // Effective rotation for this lane, reduced to the lane width.
         localparam int Offset   = RhoOffset[5*x+y] % W;
         localparam int ShiftAmt = W - Offset;
         if (Offset == 0) begin : gen_offset0
           // No rotation: pass the lane through.
           assign rho_out[x][y] = rho_in[x][y];
         end else begin : gen_others
           // Rotate towards the MSB by Offset bits: bit k of the input ends
           // up at bit (k + Offset) % W of the output.
           assign rho_out[x][y] = (rho_in[x][y] << Offset) | (rho_in[x][y] >> ShiftAmt);
         end
       end
     end
   end : g_rho

   ////////////////
   // Assertions //
   ////////////////

   `ASSERT_INIT(ValidWidth_A,
       EnMasking == 0 && Width inside {25, 50, 100, 200, 400, 800, 1600} ||
       EnMasking == 1 && Width inside {50, 100, 200, 400, 800, 1600})
   `ASSERT_INIT(ValidW_A, W inside {1, 2, 4, 8, 16, 32, 64})
   `ASSERT_INIT(ValidL_A, L inside {0, 1, 2, 3, 4, 5, 6})
   `ASSERT_INIT(ValidRound_A, MaxRound <= 24) // Keccak-f only

   // Once phase_sel_i changes from MuBi4False to MuBi4True, it shall stay at
   // MuBi4True for at least two cycles.
   lc_ctrl_pkg::lc_tx_t unused_lc_sig;
   assign unused_lc_sig = lc_escalate_en_i;
   if (EnMasking) begin : gen_selperiod_chk
     `ASSUME(SelStayTwoCycleIfTrue_A,
         ($past(phase_sel_i) == MuBi4False) && (phase_sel_i == MuBi4True)
         |=> phase_sel_i == MuBi4True, clk_i, !rst_ni || lc_escalate_en_i != lc_ctrl_pkg::Off)
   end

   ///////////////
   // Functions //
   ///////////////

   // Convert bitarray to 3D box
   // See FIPS PUB 202:
   // https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf
   // > For all triples (x,y,z) such that 0<=x<5, 0<=y<5, and 0<=z<w,
   // >    A[x,y,z]=S[w(5y+x)+z]
   function automatic box_t bitarray_to_box(logic [Width-1:0] s_in);
     box_t cube;
     for (int y = 0 ; y < 5 ; y++) begin
       for (int x = 0 ; x < 5 ; x++) begin
         // Lane (x,y) is the W-bit slice starting at bit W*(5y+x).
         cube[x][y] = s_in[W*(5*y+x) +: W];
       end
     end
     return cube;
   endfunction : bitarray_to_box

   // Convert 3D cube to bitarray (inverse of bitarray_to_box):
   // S[w(5y+x)+z] = A[x,y,z]
   function automatic logic [Width-1:0] box_to_bitarray(box_t state);
     logic [Width-1:0] flat;
     for (int y = 0 ; y < 5 ; y++) begin
       for (int x = 0 ; x < 5 ; x++) begin
         // Lane (x,y) goes into the W-bit slice starting at bit W*(5y+x).
         flat[W*(5*y+x) +: W] = state[x][y];
       end
     end
     return flat;
   endfunction : box_to_bitarray

   // Rotate integer indices: return in - 1, wrapping 0 around to num - 1.
   function automatic integer rot_int(integer in, integer num);
     if (in == 0) begin
       return num - 1;
     end
     return in - 1;
   endfunction

   // Step Mapping =============================================================
   // theta
   // XOR each bit in the state with the parity of two columns
   // C[x,z] = A[x,0,z] ^ A[x,1,z] ^ A[x,2,z] ^ A[x,3,z] ^ A[x,4,z]
   // D[x,z] = C[x-1,z] ^ C[x+1,z-1]
   // theta = A[x,y,z] ^ D[x,z]
   //
   // The index tables are fixed by the algorithm, so they are localparams like
   // the other constant tables in this module.
   localparam int ThetaIndexX1 [5] = '{4, 0, 1, 2, 3}; // (x-1)%5
   localparam int ThetaIndexX2 [5] = '{1, 2, 3, 4, 0}; // (x+1)%5
   function automatic box_t theta(box_t state);
     plane_t c; // Column parities C[x,z]
     plane_t d; // Per-column correction D[x,z]
     box_t result;
     // C: XOR of the five lanes sharing the same x.
     for (int x = 0 ; x < 5 ; x++) begin
       c[x] = state[x][0] ^ state[x][1] ^ state[x][2] ^ state[x][3] ^ state[x][4];
     end
     // D: parity of column x-1 at z, XORed with parity of column x+1 at z-1.
     for (int x = 0 ; x < 5 ; x++) begin
       for (int z = 0 ; z < W ; z++) begin
         int index_z;
         index_z = (z == 0) ? W-1 : z-1; // (z-1)%W
         d[x][z] = c[ThetaIndexX1[x]][z] ^ c[ThetaIndexX2[x]][index_z];
       end
     end
     // Apply D[x] to every lane with the same x.
     for (int x = 0 ; x < 5 ; x++) begin
       for (int y = 0 ; y < 5 ; y++) begin
         result[x][y] = state[x][y] ^ d[x];
       end
     end
     return result;
   endfunction : theta

   // rho

   // Commented out entire rho function due to VCS elaboration error.
   // (z-RhoOffset[x][y]%W) isn't considered as a constant in VCS.
   // Even changing it to W-RhoOffset[x][y]%W and assign to ShiftAmt
   // creates same error.

   // Offset : Look at Table 2 in FIPS PUB 202
   //localparam int RhoOffset [5][5]  = '{
   //  //y  0    1    2    3    4     x
   //  '{   0,  36,   3, 105, 210},// 0
   //  '{   1, 300,  10,  45,  66},// 1
   //  '{ 190,   6, 171,  15, 253},// 2
   //  '{  28,  55, 153,  21, 120},// 3
   //  '{  91, 276, 231, 136,  78} // 4
   //};

   // rotate bits of each lane by offset
   // 1. rho[0,0,z] = A[0,0,z]
   // 2. Offset swap
   //    a. (x,y) := (1,0)
   //    b. for t [0..23]
   //       i. rho[x,y,z] = A[x,y,z-(t+1)(t+2)/2]
   //       ii. (x,y) = (y, (2x+3y))
   //function automatic box_t rho(box_t state);
   //  box_t result;
   //  for (int x = 0 ; x < 5 ; x++) begin
   //    for (int y = 0 ; y < 5 ; y++) begin
   //      for (int z = 0 ; z < W ; z++) begin
   //        automatic int index_z;
   //        index_z = (z-RhoOffset[x][y])%W;
   //        result[x][y][z] = state[x][y][(z-RhoOffset[x][y])%W];
   //      end
   //    end
   //  end
   //  return result;
   //endfunction : rho

   // pi
   // rearrange the position of lanes
   // pi[x,y,z] = state[(x+3y),x,z]
   localparam int PiRotate [5][5] = '{
     //y  0    1    2    3    4     x
     '{   0,   3,   1,   4,   2},// 0
     '{   1,   4,   2,   0,   3},// 1
     '{   2,   0,   3,   1,   4},// 2
     '{   3,   1,   4,   2,   0},// 3
     '{   4,   2,   0,   3,   1} // 4
   };
   function automatic box_t pi(box_t state);
     box_t permuted;
     for (int y = 0 ; y < 5 ; y++) begin
       for (int x = 0 ; x < 5 ; x++) begin
         // Destination lane (x,y) takes the source lane (PiRotate[x][y], x).
         permuted[x][y] = state[PiRotate[x][y]][x];
       end
     end
     return permuted;
   endfunction : pi

   // chi
   // chi[x,y,z] = state[x,y,z] ^ ((state[x+1,y,z] ^ 1) & state[x+2,y,z])
   //
   // The index tables are fixed by the algorithm, so they are localparams like
   // the other constant tables in this module.
   localparam int ChiIndexX1 [5] = '{1, 2, 3, 4, 0}; // (x+1)%5
   localparam int ChiIndexX2 [5] = '{2, 3, 4, 0, 1}; // (x+2)%5
   function automatic box_t chi(box_t state);
     box_t result;
     for (int x = 0 ; x < 5 ; x++) begin
       // Whole-sheet operation: all y and z for this x are handled at once.
       result[x] = state[x] ^ ((~state[ChiIndexX1[x]]) & state[ChiIndexX2[x]]);
     end
     return result;
   endfunction : chi

   // iota
   // XOR (x,y) = (0,0) with Round Constant (RC)

   // RC parameter: Precomputed by util/keccak_rc.py. Only up-to 0..L-1 is used
   // RC = '0
   // RC[2**j-1] = rc(j+7*rnd)
   // rc(t) =
   //    1. t%255 == 0 -> 1
   //    2. R[0:7] = 'b10000000
   //    3. for i = [1..t%255]
   //      a. R = 0 || R
   //      b. R[0] = R[0] ^ R[8]
   //      c. R[4] = R[4] ^ R[8]
   //      d. R[5] = R[5] ^ R[8]
   //      e. R[6] = R[6] ^ R[8]
   //      f. R = R[0:7]
   //    4. return R[0]
   // RC has L = [0..6]
   // for lower L case, only chopping lower part of 64bit RC is sufficient.
   localparam logic [63:0] RC [24] = '{
      64'h 0000_0000_0000_0001, // Round 0
      64'h 0000_0000_0000_8082, // Round 1
      64'h 8000_0000_0000_808A, // Round 2
      64'h 8000_0000_8000_8000, // Round 3
      64'h 0000_0000_0000_808B, // Round 4
      64'h 0000_0000_8000_0001, // Round 5
      64'h 8000_0000_8000_8081, // Round 6
      64'h 8000_0000_0000_8009, // Round 7
      64'h 0000_0000_0000_008A, // Round 8
      64'h 0000_0000_0000_0088, // Round 9
      64'h 0000_0000_8000_8009, // Round 10
      64'h 0000_0000_8000_000A, // Round 11
      64'h 0000_0000_8000_808B, // Round 12
      64'h 8000_0000_0000_008B, // Round 13
      64'h 8000_0000_0000_8089, // Round 14
      64'h 8000_0000_0000_8003, // Round 15
      64'h 8000_0000_0000_8002, // Round 16
      64'h 8000_0000_0000_0080, // Round 17
      64'h 0000_0000_0000_800A, // Round 18
      64'h 8000_0000_8000_000A, // Round 19
      64'h 8000_0000_8000_8081, // Round 20
      64'h 8000_0000_0000_8080, // Round 21
      64'h 0000_0000_8000_0001, // Round 22
      64'h 8000_0000_8000_8008  // Round 23
   };

   // iota: XOR with RC for (x,y) = (0,0)
   // All other lanes are passed through unchanged.
   function automatic box_t iota(box_t state, logic [RndW-1:0] rnd);
     box_t result;
     result = state;
     // Only the lower W bits of the 64-bit round constant are applied.
     result[0][0] ^= RC[rnd][W-1:0];
     return result;
   endfunction : iota

   // Round function : Rnd(A,i_r)
   // Not used due to rho function issue described above.

   //function automatic box_t keccak_rnd(box_t state, logic [RndW-1:0] rnd);
   //  box_t keccak_state;
   //  keccak_state = iota(chi(pi(rho(theta(state)))), rnd);
   //
   //  return keccak_state;
   //endfunction : keccak_rnd

 endmodule
	// Copyright lowRISC contributors.
	// Licensed under the Apache License, Version 2.0, see LICENSE for details.
	// SPDX-License-Identifier: Apache-2.0
	//
	// This module is the single round keccak permutation module
	// It supports Keccak with up to 1600b of state

	`include "prim_assert.sv"

	module keccak_2share
	import prim_mubi_pkg::*;
	#(
	parameter int Width = 1600, // b= {25, 50, 100, 200, 400, 800, 1600}

	// Derived
	localparam int W = Width/25,
	localparam int L = $clog2(W),
	localparam int MaxRound = 12 + 2*L, // Keccak-f only
	localparam int RndW = $clog2(MaxRound+1), // Representing up to MaxRound

	// Control parameters
	parameter bit EnMasking = 0, // Enable secure hardening
	localparam int Share = EnMasking ? 2 : 1
	) (
	input clk_i,
	input rst_ni,

	input lc_ctrl_pkg::lc_tx_t lc_escalate_en_i, // Used to disable SVAs when escalating.

	input [RndW-1:0] rnd_i, // Current round index
	input mubi4_t phase_sel_i, // Output mux contol. Used when EnMasking := 1
	input [1:0] cycle_i, // Current cycle index. Used when EnMasking := 1
	input rand_aux_i, // Auxiliary randomness input. Used when EnMasking := 1
	input [Width/2-1:0] rand_i, // Randomness for remasking. Used when EnMasking := 1
	input [Width-1:0] s_i [Share],
	output logic [Width-1:0] s_o [Share]
	);
	///////////
	// Types //
	///////////
	// x y z
	typedef logic [4:0][4:0][W-1:0] box_t; // (x,y,z) state
	typedef logic [W-1:0] lane_t; // (z)
	typedef logic [4:0] [W-1:0] plane_t; // (x,z)
	typedef logic [4:0][4:0] slice_t; // (x,y)
	typedef logic [4:0][W-1:0] sheet_t; // (y,z) identical to plane_t
	typedef logic [4:0] row_t; // (x)
	typedef logic [4:0] col_t; // (y) identical to row_t

	//////////////
	// Keccak_f //
	//////////////
	box_t state_in [Share];
	box_t state_out [Share];
	box_t theta_data [Share];
	box_t rho_data [Share];
	box_t pi_data [Share];
	box_t chi_data [Share];
	box_t iota_data [Share];

	box_t phase1_in [Share];
	box_t phase1_out [Share];
	box_t phase2_in [Share];
	box_t phase2_out [Share];

	/////////////////
	// Unused nets //
	/////////////////
	// Tie off input signals that aren't used in the unmasked implementation.
	if (!EnMasking) begin : gen_tie_unused
	  // Sink signals for the clock, reset and masking-only inputs. The unmasked
	  // datapath does not consume any of them.
	  logic               unused_clk, unused_rst_n, unused_rand_aux;
	  logic         [1:0] unused_cycle;
	  logic [Width/2-1:0] unused_rand;
	  mubi4_t             unused_phase_sel;

	  assign unused_rand      = rand_i;
	  assign unused_rand_aux  = rand_aux_i;
	  assign unused_cycle     = cycle_i;
	  assign unused_phase_sel = phase_sel_i;
	  assign unused_rst_n     = rst_ni;
	  assign unused_clk       = clk_i;
	end

	//////////////////////////////////////////////////
	// Input/output type conversion and interfacing //
	//////////////////////////////////////////////////
	for (genvar sh = 0 ; sh < Share ; sh++) begin : g_state_inout
	  // Per share: flatten the (x,y,z) result for the output port and unpack
	  // the flat input port into (x,y,z) form.
	  assign s_o[sh]      = box_to_bitarray(state_out[sh]);
	  assign state_in[sh] = bitarray_to_box(s_i[sh]);
	end : g_state_inout

	if (EnMasking) begin : g_2share_data
	  // Masked: both phases read the state presented on s_i. phase_sel_i picks
	  // which phase result is driven onto s_o.
	  assign phase2_in = state_in;
	  assign phase1_in = state_in;

	  always_comb begin
	    // Only an exact MuBi4True selects the Phase 2 result. MuBi4False and
	    // every other encoding yield the Phase 1 result.
	    if (phase_sel_i == MuBi4True) begin
	      state_out = phase2_out;
	    end else begin
	      state_out = phase1_out;
	    end
	  end
	end else begin : g_single_data
	  // Unmasked: Phase 1 feeds Phase 2 directly and Phase 2 drives the output.
	  assign state_out = phase2_out;
	  assign phase2_in = phase1_out;
	  assign phase1_in = state_in;
	end

	//////////////
	// Datapath //
	//////////////
	// This module has two phases. First phase, it calculates Theta, Rho, Pi steps
	// in SHA3. At the second phase, it computes Chi and Iota steps. If masking is
	// not enabled, the two phases are completed within a single clock cycle.
	//
	// If masking is enabled, the first phase (Phase1) completes in one cycle.
	// Then, the output should be stored in the state and given to the input of
	// this module again. The second phase in the masked version needs three
	// clock cycles to complete. In the first clock cycle, the first stage of Chi
	// is computed for the first lane halves. In the second clock cycle, the
	// module then outputs the updated first lane halves. In the third clock
	// cycle, the new second lane halves are output. To make side-channel
	// analysis (SCA) harder, we randomly decide which lane halves to process
	// first on a per-round basis.
	// We use additional randomness generated by the PRNG to take this decision
	// (rand_aux_i). For more details, refer to the comments in the "MUX control"
	// section below.

	for (genvar sh = 0 ; sh < Share ; sh++) begin : g_datapath
	  // Phase 1: Theta -> Rho -> Pi.
	  // Rho is not a function call: rho_data is driven separately, because
	  // VCS rejected z-Offset%W as a non-constant expression.
	  assign pi_data[sh]    = pi(rho_data[sh]);
	  assign theta_data[sh] = theta(phase1_in[sh]);

	  // Phase 2 (Cycles 1, 2 and 3): Chi and Iota are implemented below.
	end : g_datapath

	// The Pi result is the output of Phase 1.
	assign phase1_out = pi_data;

	// Iota adds the Round Constant (RC). The constant is XORed into share 0
	// only; with two shares, share 1 passes through unchanged.
	assign iota_data[0] = iota(chi_data[0], rnd_i);
	if (EnMasking) begin : g_2share_iota
	  assign iota_data[1] = chi_data[1];
	end

	if (EnMasking) begin : g_2share_chi
	// Domain-Oriented Masking
	// reference: https://eprint.iacr.org/2017/395.pdf

	localparam int unsigned WSheetHalf = $bits(sheet_t)/2;
	logic [4:0][WSheetHalf-1:0] in_prd, out_prd;

	logic low_then_high_d, low_then_high_q;
	logic in_data_low, out_data_low;
	logic in_rand_ext;
	logic update_dom;

	/////////////////
	// MUX control //
	/////////////////

	// Lane-half processing order: take a new value from rand_aux_i while
	// phase_sel_i tests strictly false (Phase 1), otherwise hold the
	// registered value so that the order stays constant for the whole round.
	assign low_then_high_d = mubi4_test_false_strict(phase_sel_i) ? rand_aux_i
	                                                             : low_then_high_q;

	always_ff @(posedge clk_i or negedge rst_ni) begin
	  if (!rst_ni) low_then_high_q <= 1'b0;
	  else         low_then_high_q <= low_then_high_d;
	end

	// This implementation uses both randomness provided from an external PRNG
	// as well as intermediate results for remasking the DOM multipliers below.
	// Per clock cycle, 800b of pseudo-random data (PRD) are required. The
	// following schedule is used to only ever update the input data when also
	// providing fresh randomness and vice versa.
	//
	// Cycle 0: Compute Theta, Rho, Pi - The DOM multipliers are not evaluated
	// at all: the inputs are driven by the first lane halves (same
	// values as in Cycle 3). Also the intermediate results we already
	// had in Cycle 3 didn't change.
	// Cycle 1: Compute first stage of Chi for first lane halves using the DOM
	// multipliers. We use the fresh randomness provided from the
	// PRNG for remasking.
	// Cycle 2: Compute second stage of Chi and Iota for first lane halves.
	// Compute first stage of Chi for second lane halves. We use the
	// fresh randomness provided from the PRNG for remasking the
	// DOM multipliers.
	// Cycle 3: Compute second stage of Chi and Iota for second lane halves.
	// Feed again first lane halves to DOM multiplier inputs (now
	// the updated values become visible) together with intermediate
	// results of Cycle 2. Don't update the register stage inside
	// the DOM multipliers.
	always_comb begin
	  // Values for Cycles 0 and 3 (and any other cycle_i value): feed the
	  // first lane halves, remask with intermediate results and do not
	  // update the DOM multipliers.
	  in_data_low = low_then_high_q;
	  in_rand_ext = 1'b0;
	  update_dom  = 1'b0;

	  unique case (cycle_i)
	    2'h1: begin
	      // First lane halves, fresh external randomness.
	      in_rand_ext = 1'b1;
	      update_dom  = 1'b1;
	    end
	    2'h2: begin
	      // Second lane halves, fresh external randomness.
	      in_data_low = ~low_then_high_q;
	      in_rand_ext = 1'b1;
	      update_dom  = 1'b1;
	    end
	    default: ; // Keep the values assigned above.
	  endcase
	end

	// When taking the lower lane halves in, the upper lane halves are output
	// and vice versa.
	assign out_data_low = ~in_data_low;

	/////////////////////
	// DOM multipliers //
	/////////////////////

	// One DOM AND instance per sheet index x. Each iteration computes
	//   chi_data[*][x] = phase2_in[*][x] ^ DOM_AND(~phase2_in[*][X1], phase2_in[*][X2])
	// on one lane half at a time; in_data_low selects which half is fed in.
	for (genvar x = 0 ; x < 5 ; x++) begin : g_chi_w
	// Sheet indices of the two AND operands: (x+1)%5 and (x+2)%5.
	localparam int X1 = (x + 1) % 5;
	localparam int X2 = (x + 2) % 5;

	sheet_t sheet0[Share]; // Inverted input X1
	sheet_t sheet1[Share]; // X2
	sheet_t sheet2[Share]; // DOM output

	// Only share 0 of the X1 operand is inverted.
	// NOTE(review): presumably this complements the recombined (XOR of both
	// shares) value - confirm against the masking scheme.
	assign sheet0[0] = ~phase2_in[0][X1];
	assign sheet0[1] = phase2_in[1][X1];

	assign sheet1[0] = phase2_in[0][X2];
	assign sheet1[1] = phase2_in[1][X2];

	// Convert sheet_t to 1D arrays, one for the upper and lower half lane.
	// Naming: a* = inverted X1 operand, b* = X2 operand, 0/1 = share index,
	// _l/_h = lower/upper lane half. Each vector packs five W/2-bit half lanes
	// with y = 0 in the most significant position and y = 4 in the least
	// significant one. WSheetHalf is declared outside this block; the
	// concatenations below are 5*W/2 bits wide.
	logic [WSheetHalf-1:0] a0_l, a1_l, b0_l, b1_l;
	logic [WSheetHalf-1:0] a0_h, a1_h, b0_h, b1_h;
	logic [WSheetHalf-1:0] a0, a1, b0, b1, q0, q1;

	assign a0_l = {sheet0[0][0][W/2-1:0],
	sheet0[0][1][W/2-1:0],
	sheet0[0][2][W/2-1:0],
	sheet0[0][3][W/2-1:0],
	sheet0[0][4][W/2-1:0]};
	assign a1_l = {sheet0[1][0][W/2-1:0],
	sheet0[1][1][W/2-1:0],
	sheet0[1][2][W/2-1:0],
	sheet0[1][3][W/2-1:0],
	sheet0[1][4][W/2-1:0]};

	assign a0_h = {sheet0[0][0][W-1:W/2],
	sheet0[0][1][W-1:W/2],
	sheet0[0][2][W-1:W/2],
	sheet0[0][3][W-1:W/2],
	sheet0[0][4][W-1:W/2]};
	assign a1_h = {sheet0[1][0][W-1:W/2],
	sheet0[1][1][W-1:W/2],
	sheet0[1][2][W-1:W/2],
	sheet0[1][3][W-1:W/2],
	sheet0[1][4][W-1:W/2]};

	assign b0_l = {sheet1[0][0][W/2-1:0],
	sheet1[0][1][W/2-1:0],
	sheet1[0][2][W/2-1:0],
	sheet1[0][3][W/2-1:0],
	sheet1[0][4][W/2-1:0]};
	assign b1_l = {sheet1[1][0][W/2-1:0],
	sheet1[1][1][W/2-1:0],
	sheet1[1][2][W/2-1:0],
	sheet1[1][3][W/2-1:0],
	sheet1[1][4][W/2-1:0]};

	assign b0_h = {sheet1[0][0][W-1:W/2],
	sheet1[0][1][W-1:W/2],
	sheet1[0][2][W-1:W/2],
	sheet1[0][3][W-1:W/2],
	sheet1[0][4][W-1:W/2]};
	assign b1_h = {sheet1[1][0][W-1:W/2],
	sheet1[1][1][W-1:W/2],
	sheet1[1][2][W-1:W/2],
	sheet1[1][3][W-1:W/2],
	sheet1[1][4][W-1:W/2]};

	// Input muxing
	// in_data_low = 1 feeds the lower lane halves, 0 the upper lane halves.
	assign a0 = in_data_low ? a0_l : a0_h;
	assign a1 = in_data_low ? a1_l : a1_h;
	assign b0 = in_data_low ? b0_l : b0_h;
	assign b1 = in_data_low ? b1_l : b1_h;

	// Randomness muxing
	// Intermediate results are rotated across rows. The new Row x depends on
	// data from Rows x + 1 and x + 2. Hence we don't want to use intermediate
	// results from Rows x, x + 1, and x + 2 for remasking.
	// With in_rand_ext set, instance x takes its own WSheetHalf-bit slice of
	// rand_i; otherwise it takes prd_o of instance rot_int(x, 5), i.e.
	// instance x - 1 (wrapping from 0 to 4).
	assign in_prd[x] = in_rand_ext ? rand_i[x * WSheetHalf +: WSheetHalf] :
	out_prd[rot_int(x, 5)];

	prim_dom_and_2share #(
	.DW (WSheetHalf), // a half sheet
	.Pipeline(1) // Process the full sheet in 3 clock cycles. This reduces
	// SCA leakage.
	) u_dom (
	.clk_i,
	.rst_ni,

	.a0_i (a0),
	.a1_i (a1),
	.b0_i (b0),
	.b1_i (b1),
	.z_valid_i (update_dom),
	.z_i (in_prd[x]),
	.q0_o (q0),
	.q1_o (q1),
	.prd_o (out_prd[x])
	);

	// Output conversion from q0, q1 to sheet_t
	// For simplicity, we forward the generated lane half to both the upper
	// and lower lane halves at this point. The actual output muxing/selection
	// happens after the Iota step when generating phase2_out from iota_data
	// and state_in below.
	// The slice order mirrors the input packing: the least significant W/2
	// bits of q0/q1 belong to y = 4, the most significant ones to y = 0.
	assign sheet2[0][4] = {2{q0[W/2*0+:W/2]}};
	assign sheet2[0][3] = {2{q0[W/2*1+:W/2]}};
	assign sheet2[0][2] = {2{q0[W/2*2+:W/2]}};
	assign sheet2[0][1] = {2{q0[W/2*3+:W/2]}};
	assign sheet2[0][0] = {2{q0[W/2*4+:W/2]}};

	assign sheet2[1][4] = {2{q1[W/2*0+:W/2]}};
	assign sheet2[1][3] = {2{q1[W/2*1+:W/2]}};
	assign sheet2[1][2] = {2{q1[W/2*2+:W/2]}};
	assign sheet2[1][1] = {2{q1[W/2*3+:W/2]}};
	assign sheet2[1][0] = {2{q1[W/2*4+:W/2]}};

	// Final XOR to generate the output
	// Each share of the DOM result is XORed with the same share of sheet x.
	assign chi_data[0][x] = sheet2[0] ^ phase2_in[0][x];
	assign chi_data[1][x] = sheet2[1] ^ phase2_in[1][x];
	end : g_chi_w

	// Chi and Iota only produce one lane half per pass. For every lane of both
	// shares, the half selected by out_data_low takes the new Iota result while
	// the other half is forwarded unchanged from state_in.
	for (genvar x = 0 ; x < 5 ; x++) begin : g_2share_phase2_out_row
	  for (genvar y = 0 ; y < 5 ; y++) begin : g_2share_phase2_out_col
	    for (genvar i = 0 ; i < Share ; i++) begin : g_2share_phase2_out_share
	      // Upper lane half: updated when the upper halves are being output.
	      assign phase2_out[i][x][y][W-1:W/2] = out_data_low ?
	           state_in[i][x][y][W-1:W/2] :
	          iota_data[i][x][y][W-1:W/2];
	      // Lower lane half: updated when the lower halves are being output.
	      assign phase2_out[i][x][y][W/2-1:0] = out_data_low ?
	          iota_data[i][x][y][W/2-1:0] :
	           state_in[i][x][y][W/2-1:0];
	    end
	  end
	end

	end else begin : g_single_chi
	assign chi_data[0] = chi(phase2_in[0]);
	assign phase2_out = iota_data;
	end

	// Rho ======================================================================
	// Rotates every lane by its fixed offset. The offsets are kept in a
	// flattened 1-D table indexed by 5*x+y and applied through generate blocks:
	// VCS treated RhoOffset[x][y] as a variable int, and the 1-D form was
	// introduced to satisfy Verilator lint.
	localparam int RhoOffset [25] = '{
	  // y = 0    1    2    3    4
	         0,  36,   3, 105, 210, // x = 0 (index  0 ..  4)
	         1, 300,  10,  45,  66, // x = 1 (index  5 ..  9)
	       190,   6, 171,  15, 253, // x = 2 (index 10 .. 14)
	        28,  55, 153,  21, 120, // x = 3 (index 15 .. 19)
	        91, 276, 231, 136,  78  // x = 4 (index 20 .. 24)
	};
	for (genvar i = 0 ; i < Share ; i++) begin : g_rho
	  box_t rho_in, rho_out;
	  assign rho_in      = theta_data[i];
	  assign rho_data[i] = rho_out;

	  for (genvar x = 0 ; x < 5 ; x++) begin : gen_rho_x
	    for (genvar y = 0 ; y < 5 ; y++) begin : gen_rho_y
	      // Offset is the rotation amount reduced to the lane width, ShiftAmt
	      // the number of bits that stay below the wrap-around point.
	      localparam int Offset   = RhoOffset[5*x+y]%W;
	      localparam int ShiftAmt = W - Offset;
	      if (Offset == 0) begin : gen_offset0
	        // No rotation: pass the lane through.
	        assign rho_out[x][y] = rho_in[x][y];
	      end else begin : gen_others
	        // Bits [ShiftAmt-1:0] move up by Offset, the top Offset bits wrap
	        // around to the bottom.
	        assign rho_out[x][y] = {rho_in[x][y][ShiftAmt-1:0],
	                                rho_in[x][y][W-1:ShiftAmt]};
	      end
	    end
	  end
	end : g_rho

	////////////////
	// Assertions //
	////////////////

	// Elaboration-time checks of the parameterization. The masked variant
	// splits every lane into two halves (W/2), hence Width = 25 (W = 1) is only
	// accepted when EnMasking is 0.
	`ASSERT_INIT(ValidWidth_A,
	    EnMasking == 0 && Width inside {25, 50, 100, 200, 400, 800, 1600} ||
	    EnMasking == 1 && Width inside {50, 100, 200, 400, 800, 1600})
	`ASSERT_INIT(ValidW_A, W inside {1, 2, 4, 8, 16, 32, 64})
	`ASSERT_INIT(ValidL_A, L inside {0, 1, 2, 3, 4, 5, 6})
	`ASSERT_INIT(ValidRound_A, MaxRound <= 24) // Keccak-f only

	// Once phase_sel_i changes from MuBi4False to MuBi4True, it shall remain
	// MuBi4True in the following cycle, i.e., stay true for two cycles. The
	// check is disabled during reset and whenever lc_escalate_en_i is not Off.
	// unused_lc_sig references lc_escalate_en_i outside of the assertion
	// (presumably to avoid an unused-input warning when SVAs are compiled out).
	lc_ctrl_pkg::lc_tx_t unused_lc_sig;
	assign unused_lc_sig = lc_escalate_en_i;
	if (EnMasking) begin : gen_selperiod_chk
	  `ASSUME(SelStayTwoCycleIfTrue_A,
	      ($past(phase_sel_i) == MuBi4False) && (phase_sel_i == MuBi4True)
	      |=> phase_sel_i == MuBi4True, clk_i, !rst_ni || lc_escalate_en_i != lc_ctrl_pkg::Off)
	end

	///////////////
	// Functions //
	///////////////

	// Convert bitarray to 3D box
	// Please take a look at FIPS PUB 202
	// https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf
	// > For all triples (x,y,z) such that 0<=x<5, 0<=y<5, and 0<=z<w,
	// > A[x,y,z]=S[w(5y+x)+z]
	//
	// s_in   : flat Width-bit state.
	// return : box_t with box[x][y][z] = s_in[W*(5*y+x) + z], i.e. lane (x,y)
	//          occupies the W consecutive bits starting at W*(5*y+x).
	function automatic box_t bitarray_to_box(logic [Width-1:0] s_in);
	  automatic box_t box;
	  for (int y = 0 ; y < 5 ; y++) begin
	    for (int x = 0 ; x < 5 ; x++) begin
	      for (int z = 0 ; z < W ; z++) begin
	        box[x][y][z] = s_in[W*(5*y+x) + z];
	      end
	    end
	  end
	  return box;
	endfunction : bitarray_to_box

	// Convert 3D cube to bitarray
	//
	// state  : box_t indexed as state[x][y][z].
	// return : flat Width-bit vector with bit W*(5*y+x) + z = state[x][y][z],
	//          i.e. lane (x,y) occupies the W consecutive bits starting at
	//          W*(5*y+x).
	function automatic logic [Width-1:0] box_to_bitarray(box_t state);
	  automatic logic [Width-1:0] bitarray;
	  for (int y = 0 ; y < 5 ; y++) begin
	    for (int x = 0 ; x < 5 ; x++) begin
	      for (int z = 0 ; z < W ; z++) begin
	        bitarray[W*(5*y+x) + z] = state[x][y][z];
	      end
	    end
	  end
	  return bitarray;
	endfunction : box_to_bitarray

	// Rotate integer indices
	// Returns the predecessor of index `in` within the range [0, num-1]:
	// 0 wraps around to num - 1, every other value maps to in - 1.
	function automatic integer rot_int(integer in, integer num);
	  return (in == 0) ? (num - 1) : (in - 1);
	endfunction

	// Step Mapping =============================================================
	// theta
	// XOR each bit in the state with the parity of two columns
	//   C[x,z]       = A[x,0,z] ^ A[x,1,z] ^ A[x,2,z] ^ A[x,3,z] ^ A[x,4,z]
	//   D[x,z]       = C[(x-1)%5,z] ^ C[(x+1)%5,(z-1)%W]
	//   theta[x,y,z] = A[x,y,z] ^ D[x,z]
	parameter int ThetaIndexX1 [5] = '{4, 0, 1, 2, 3}; // (x-1)%5
	parameter int ThetaIndexX2 [5] = '{1, 2, 3, 4, 0}; // (x+1)%5
	function automatic box_t theta(box_t state);
	  plane_t col_parity; // C
	  plane_t mix;        // D
	  box_t   result;

	  // C: fold the five lanes of every sheet x into one parity lane.
	  for (int x = 0 ; x < 5 ; x++) begin
	    col_parity[x] = state[x][0];
	    for (int y = 1 ; y < 5 ; y++) begin
	      col_parity[x] = col_parity[x] ^ state[x][y];
	    end
	  end

	  // D: combine the parity of column x-1 with the parity of column x+1 taken
	  // one bit position lower (wrapping from 0 to W-1).
	  for (int x = 0 ; x < 5 ; x++) begin
	    for (int z = 0 ; z < W ; z++) begin
	      int z_prev;
	      z_prev = (z > 0) ? z-1 : W-1; // (z-1)%W
	      mix[x][z] = col_parity[ThetaIndexX1[x]][z] ^ col_parity[ThetaIndexX2[x]][z_prev];
	    end
	  end

	  // Apply D to every lane of the matching sheet.
	  for (int y = 0 ; y < 5 ; y++) begin
	    for (int x = 0 ; x < 5 ; x++) begin
	      result[x][y] = state[x][y] ^ mix[x];
	    end
	  end
	  return result;
	endfunction : theta

	// rho

	// Commented out entire rho function due to VCS elaboration error.
	// (z-RhoOffset[x][y]%W) isn't considered as a constant in VCS.
	// Even changing it to W-RhoOffset[x][y]%W and assign to ShiftAmt
	// creates same error.

	// Offset : Look at Table 2 in FIPS PUB 202
	//localparam int RhoOffset [5][5] = '{
	// //y 0 1 2 3 4 x
	// '{ 0, 36, 3, 105, 210},// 0
	// '{ 1, 300, 10, 45, 66},// 1
	// '{ 190, 6, 171, 15, 253},// 2
	// '{ 28, 55, 153, 21, 120},// 3
	// '{ 91, 276, 231, 136, 78} // 4
	//};

	// rotate bits of each lane by offset
	// 1. rho[0,0,z] = A[0,0,z]
	// 2. Offset swap
	// a. (x,y) := (1,0)
	// b. for t [0..23]
	// i. rho[x,y,z] = A[x,y,z-(t+1)(t+2)/2]
	// ii. (x,y) = (y, (2x+3y))
	//function automatic box_t rho(box_t state);
	// box_t result;
	// for (int x = 0 ; x < 5 ; x++) begin
	// for (int y = 0 ; y < 5 ; y++) begin
	// for (int z = 0 ; z < W ; z++) begin
	// automatic int index_z;
	// index_z = (z-RhoOffset[x][y])%W;
	// result[x][y][z] = state[x][y][(z-RhoOffset[x][y])%W];
	// end
	// end
	// end
	// return result;
	//endfunction : rho

	// pi
	// Rearranges the lanes without touching their contents:
	//   pi[x,y,z] = state[(x+3y)%5,x,z]
	// PiRotate[x][y] holds the precomputed source index (x+3y)%5.
	localparam int PiRotate [5][5] = '{
	  // y = 0  1  2  3  4
	    '{ 0, 3, 1, 4, 2}, // x = 0
	    '{ 1, 4, 2, 0, 3}, // x = 1
	    '{ 2, 0, 3, 1, 4}, // x = 2
	    '{ 3, 1, 4, 2, 0}, // x = 3
	    '{ 4, 2, 0, 3, 1}  // x = 4
	};
	function automatic box_t pi(box_t state);
	  box_t result;
	  for (int y = 0 ; y < 5 ; y++) begin
	    for (int x = 0 ; x < 5 ; x++) begin
	      // Whole-lane copy from source position (PiRotate[x][y], x).
	      result[x][y] = state[PiRotate[x][y]][x];
	    end
	  end
	  return result;
	endfunction : pi

	// chi
	// Non-linear step, applied lane by lane:
	//   chi[x,y,z] = state[x,y,z] ^ ((state[x+1,y,z] ^ 1) & state[x+2,y,z])
	// with the x indices taken modulo 5 through the lookup tables below.
	parameter int ChiIndexX1 [5] = '{1, 2, 3, 4, 0}; // (x+1)%5
	parameter int ChiIndexX2 [5] = '{2, 3, 4, 0, 1}; // (x+2)%5
	function automatic box_t chi(box_t state);
	  box_t result;
	  for (int x = 0 ; x < 5 ; x++) begin
	    for (int y = 0 ; y < 5 ; y++) begin
	      result[x][y] = state[x][y] ^
	                     (~state[ChiIndexX1[x]][y] & state[ChiIndexX2[x]][y]);
	    end
	  end
	  return result;
	endfunction : chi

	// iota
	// XOR (x,y) = (0,0) with Round Constant (RC)

	// RC parameter: Precomputed by util/keccak_rc.py. Only up-to 0..L-1 is used
	// RC = '0
	// RC[2**j-1] = rc(j+7*rnd)
	// rc(t) =
	// 1. t%255 == 0 -> 1
	// 2. R[0:7] = 'b10000000
	// 3. for i = [1..t%255]
	// a. R = 0 || R (bit-string concatenation as in FIPS PUB 202, not logical OR)
	// b. R[0] = R[0] ^ R[8]
	// c. R[4] = R[4] ^ R[8]
	// d. R[5] = R[5] ^ R[8]
	// e. R[6] = R[6] ^ R[8]
	// f. R = R[0:7]
	// 4. return R[0]
	// RC has L = [0..6]
	// for lower L case, only chopping lower part of 64bit RC is sufficient.
	// One 64-bit constant per round index 0..23; the iota function in this
	// file uses only the lower W bits of the selected entry.
	localparam logic [63:0] RC [24] = '{
	64'h 0000_0000_0000_0001, // Round 0
	64'h 0000_0000_0000_8082, // Round 1
	64'h 8000_0000_0000_808A, // Round 2
	64'h 8000_0000_8000_8000, // Round 3
	64'h 0000_0000_0000_808B, // Round 4
	64'h 0000_0000_8000_0001, // Round 5
	64'h 8000_0000_8000_8081, // Round 6
	64'h 8000_0000_0000_8009, // Round 7
	64'h 0000_0000_0000_008A, // Round 8
	64'h 0000_0000_0000_0088, // Round 9
	64'h 0000_0000_8000_8009, // Round 10
	64'h 0000_0000_8000_000A, // Round 11
	64'h 0000_0000_8000_808B, // Round 12
	64'h 8000_0000_0000_008B, // Round 13
	64'h 8000_0000_0000_8089, // Round 14
	64'h 8000_0000_0000_8003, // Round 15
	64'h 8000_0000_0000_8002, // Round 16
	64'h 8000_0000_0000_0080, // Round 17
	64'h 0000_0000_0000_800A, // Round 18
	64'h 8000_0000_8000_000A, // Round 19
	64'h 8000_0000_8000_8081, // Round 20
	64'h 8000_0000_0000_8080, // Round 21
	64'h 0000_0000_8000_0001, // Round 22
	64'h 8000_0000_8000_8008 // Round 23
	};

	// iota: XOR with RC for (x,y) = (0,0)
	// state  : input box
	// rnd    : round index selecting the entry of RC
	// return : copy of state whose lane (0,0) is XORed with the lower W bits of
	//          RC[rnd]; all other lanes are passed through unchanged.
	function automatic box_t iota(box_t state, logic [RndW-1:0] rnd);
	  box_t  result;
	  lane_t rc_lane;

	  rc_lane      = RC[rnd][W-1:0];
	  result       = state;
	  result[0][0] = state[0][0] ^ rc_lane;

	  return result;
	endfunction : iota

	// Round function : Rnd(A,i_r)
	// Not used due to rho function issue described above.

	//function automatic box_t keccak_rnd(box_t state, logic [RndW-1:0] rnd);
	// box_t keccak_state;
	// keccak_state = iota(chi(pi(rho(theta(state)))), rnd);
	//
	// return keccak_state;
	//endfunction : keccak_rnd

	endmodule