[prim] Move prim_present and prim_prince constants into package

This moves the constants and subfunctions associated with prim_present,
prim_prince into a package dedicated for scrambler blocks. Other blocks
like the aes_prng or prim_gate_gen can then import subfunctions from
this shared package.

This commit also adds the inverse sbox and perm LUTs for PRESENT.

Signed-off-by: Michael Schaffner <msf@opentitan.org>
diff --git a/hw/ip/aes/rtl/aes_prng.sv b/hw/ip/aes/rtl/aes_prng.sv
index f0c7046..a69cfe2 100644
--- a/hw/ip/aes/rtl/aes_prng.sv
+++ b/hw/ip/aes/rtl/aes_prng.sv
@@ -7,7 +7,7 @@
 // This module uses an LFSR connected to a PRINCE S-Box to provide pseudo-random data to the AES
 // module primarily for clearing registers. The LFSR can be reseeded using an external interface.
 
-module aes_prng(
+module aes_prng (
   input  logic        clk_i,
   input  logic        rst_ni,
 
@@ -26,22 +26,6 @@
 
   localparam int unsigned DATA_WIDTH = 64;
 
-  // The S-Box of the PRINCE cipher is used to "scramble" the LFSR output.
-  localparam logic[15:0][3:0] PRINCE_SBOX_FWD = {4'h4, 4'hD, 4'h5, 4'hE,
-                                                 4'h0, 4'h8, 4'h7, 4'h6,
-                                                 4'h1, 4'h9, 4'hC, 4'hA,
-                                                 4'h2, 4'h3, 4'hF, 4'hB};
-
-  // "Scramble" with PRINCE cipher S-Box.
-  function automatic logic [63:0] aes_prng_scramble(logic [63:0] in);
-    logic [63:0] out;
-    // The PRINCE cipher S-Box operates on 4-bit nibbles.
-    for (int i=0; i<16; i++) begin
-      out[i*4 +: 4] = PRINCE_SBOX_FWD[in[i*4 +: 4]];
-    end
-    return out;
-  endfunction
-
   logic [DATA_WIDTH-1:0] lfsr_state;
   logic                  lfsr_en;
   logic                  seed_en;
@@ -73,6 +57,6 @@
   );
 
   // "Scramble" the LFSR state.
-  assign data_o = aes_prng_scramble(lfsr_state);
+  assign data_o = prim_cipher_pkg::sbox4_64bit(lfsr_state, prim_cipher_pkg::PRINCE_SBOX4);
 
 endmodule
diff --git a/hw/ip/prim/prim.core b/hw/ip/prim/prim.core
index a7092b0..5a396e6 100644
--- a/hw/ip/prim/prim.core
+++ b/hw/ip/prim/prim.core
@@ -29,12 +29,13 @@
       - rtl/prim_fifo_async.sv
       - rtl/prim_fifo_sync.sv
       - rtl/prim_flop_2sync.sv
-      - rtl/prim_gate_gen.sv
       - rtl/prim_keccak.sv
       - rtl/prim_lfsr.sv
       - rtl/prim_packer.sv
+      - rtl/prim_cipher_pkg.sv
       - rtl/prim_present.sv
       - rtl/prim_prince.sv
+      - rtl/prim_gate_gen.sv
       - rtl/prim_pulse_sync.sv
       - rtl/prim_filter.sv
       - rtl/prim_filter_ctr.sv
diff --git a/hw/ip/prim/rtl/prim_cipher_pkg.sv b/hw/ip/prim/rtl/prim_cipher_pkg.sv
new file mode 100644
index 0000000..15df258
--- /dev/null
+++ b/hw/ip/prim/rtl/prim_cipher_pkg.sv
@@ -0,0 +1,363 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// This package holds common constants and functions for PRESENT- and
+// PRINCE-based scrambling devices.
+//
+// See also: prim_present, prim_prince
+//
+// References: - https://en.wikipedia.org/wiki/PRESENT
+//             - https://en.wikipedia.org/wiki/Prince_(cipher)
+//             - http://www.lightweightcrypto.org/present/present_ches2007.pdf
+//             - https://eprint.iacr.org/2012/529.pdf
+//             - https://eprint.iacr.org/2015/372.pdf
+//             - https://eprint.iacr.org/2014/656.pdf
+
+package prim_cipher_pkg;
+
+  ///////////////////
+  // PRINCE Cipher //
+  ///////////////////
+
+  parameter logic [15:0][3:0] PRINCE_SBOX4 = {4'h4, 4'hD, 4'h5, 4'hE,
+                                              4'h0, 4'h8, 4'h7, 4'h6,
+                                              4'h1, 4'h9, 4'hC, 4'hA,
+                                              4'h2, 4'h3, 4'hF, 4'hB};
+
+  parameter logic [15:0][3:0] PRINCE_SBOX4_INV = {4'h1, 4'hC, 4'hE, 4'h5,
+                                                  4'h0, 4'h4, 4'h6, 4'hA,
+                                                  4'h9, 4'h8, 4'hD, 4'hF,
+                                                  4'h2, 4'h3, 4'h7, 4'hB};
+  // nibble permutations
+  parameter logic [15:0][3:0] PRINCE_SHIFT_ROWS64  = '{4'hB, 4'h6, 4'h1, 4'hC,
+                                                       4'h7, 4'h2, 4'hD, 4'h8,
+                                                       4'h3, 4'hE, 4'h9, 4'h4,
+                                                       4'hF, 4'hA, 4'h5, 4'h0};
+
+  parameter logic [15:0][3:0] PRINCE_SHIFT_ROWS64_INV = '{4'h3, 4'h6, 4'h9, 4'hC,
+                                                          4'hF, 4'h2, 4'h5, 4'h8,
+                                                          4'hB, 4'hE, 4'h1, 4'h4,
+                                                          4'h7, 4'hA, 4'hD, 4'h0};
+
+  // these are the round constants
+  parameter logic [11:0][63:0] PRINCE_ROUND_CONST = {64'hC0AC29B7C97C50DD,
+                                                     64'hD3B5A399CA0C2399,
+                                                     64'h64A51195E0E3610D,
+                                                     64'hC882D32F25323C54,
+                                                     64'h85840851F1AC43AA,
+                                                     64'h7EF84F78FD955CB1,
+                                                     64'hBE5466CF34E90C6C,
+                                                     64'h452821E638D01377,
+                                                     64'h082EFA98EC4E6C89,
+                                                     64'hA4093822299F31D0,
+                                                     64'h13198A2E03707344,
+                                                     64'h0000000000000000};
+
+  // tweak constant for key modification between enc/dec modes
+  parameter logic [63:0] PRINCE_ALPHA_CONST = 64'hC0AC29B7C97C50DD;
+
+  // masking constants for the M prime multiplication functions below
+  parameter logic [15:0] PRINCE_SHIFT_ROWS_CONST0 = 16'hEDB7;
+  parameter logic [15:0] PRINCE_SHIFT_ROWS_CONST1 = 16'h7EDB;
+  parameter logic [15:0] PRINCE_SHIFT_ROWS_CONST2 = 16'hB7ED;
+  parameter logic [15:0] PRINCE_SHIFT_ROWS_CONST3 = 16'hDB7E;
+
+  // nibble shifts
+  function automatic logic [31:0] prince_shiftrows_32bit(logic [31:0]      state_in,
+                                                         logic [15:0][3:0] shifts );
+    logic [15:0][1:0] cells_in, cells_out;
+    // the reduced state consists of 16 cells of 2bit each (instead of 16 nibbles)
+    cells_in = state_in;
+    for (int c = 0; c < 16; c++) begin
+      cells_out[c] = cells_in[shifts[c]]; // output cell c is fetched from cell shifts[c]
+    end
+    return cells_out;
+  endfunction : prince_shiftrows_32bit
+
+  function automatic logic [63:0] prince_shiftrows_64bit(logic [63:0]      state_in,
+                                                         logic [15:0][3:0] shifts );
+    logic [63:0] shifted;
+    // output nibble n is fetched from the input nibble selected by shifts[n]
+    for (int n = 0; n < 16; n++) begin
+      shifted[4*n +: 4] = state_in[{shifts[n], 2'b00} +: 4];
+    end
+    return shifted;
+  endfunction : prince_shiftrows_64bit
+
+  // fold a 16bit vector into 4bit by XORing its four nibbles together
+  function automatic logic [3:0] prince_nibble_red16(logic [15:0] vect);
+    return vect[15:12] ^ vect[11:8] ^ vect[7:4] ^ vect[3:0];
+  endfunction : prince_nibble_red16
+
+  // M prime multiplication: every output nibble is the XOR reduction of a masked 16bit half
+  function automatic logic [31:0] prince_mult_prime_32bit(logic [31:0] state_in);
+    logic [15:0] lo, hi;
+    {hi, lo} = state_in;
+    // M1 (upper half), most significant nibble first
+    return {prince_nibble_red16(hi & PRINCE_SHIFT_ROWS_CONST0),
+            prince_nibble_red16(hi & PRINCE_SHIFT_ROWS_CONST3),
+            prince_nibble_red16(hi & PRINCE_SHIFT_ROWS_CONST2),
+            prince_nibble_red16(hi & PRINCE_SHIFT_ROWS_CONST1),
+            // M0 (lower half)
+            prince_nibble_red16(lo & PRINCE_SHIFT_ROWS_CONST3),
+            prince_nibble_red16(lo & PRINCE_SHIFT_ROWS_CONST2),
+            prince_nibble_red16(lo & PRINCE_SHIFT_ROWS_CONST1),
+            prince_nibble_red16(lo & PRINCE_SHIFT_ROWS_CONST0)};
+  endfunction : prince_mult_prime_32bit
+
+  // M prime multiplication: every output nibble is the XOR reduction of a masked 16bit word
+  function automatic logic [63:0] prince_mult_prime_64bit(logic [63:0] state_in);
+    logic [3:0][15:0] w;
+    w = state_in;
+    // M0 (word 3, bits 63:48), most significant nibble first
+    return {prince_nibble_red16(w[3] & PRINCE_SHIFT_ROWS_CONST3),
+            prince_nibble_red16(w[3] & PRINCE_SHIFT_ROWS_CONST2),
+            prince_nibble_red16(w[3] & PRINCE_SHIFT_ROWS_CONST1),
+            prince_nibble_red16(w[3] & PRINCE_SHIFT_ROWS_CONST0),
+            // M1 (word 2, bits 47:32)
+            prince_nibble_red16(w[2] & PRINCE_SHIFT_ROWS_CONST0),
+            prince_nibble_red16(w[2] & PRINCE_SHIFT_ROWS_CONST3),
+            prince_nibble_red16(w[2] & PRINCE_SHIFT_ROWS_CONST2),
+            prince_nibble_red16(w[2] & PRINCE_SHIFT_ROWS_CONST1),
+            // M1 (word 1, bits 31:16)
+            prince_nibble_red16(w[1] & PRINCE_SHIFT_ROWS_CONST0),
+            prince_nibble_red16(w[1] & PRINCE_SHIFT_ROWS_CONST3),
+            prince_nibble_red16(w[1] & PRINCE_SHIFT_ROWS_CONST2),
+            prince_nibble_red16(w[1] & PRINCE_SHIFT_ROWS_CONST1),
+            // M0 (word 0, bits 15:0)
+            prince_nibble_red16(w[0] & PRINCE_SHIFT_ROWS_CONST3),
+            prince_nibble_red16(w[0] & PRINCE_SHIFT_ROWS_CONST2),
+            prince_nibble_red16(w[0] & PRINCE_SHIFT_ROWS_CONST1),
+            prince_nibble_red16(w[0] & PRINCE_SHIFT_ROWS_CONST0)};
+  endfunction : prince_mult_prime_64bit
+
+
+  ////////////////////
+  // PRESENT Cipher //
+  ////////////////////
+
+  // this is the sbox from the present cipher
+  parameter logic [15:0][3:0] PRESENT_SBOX4 = {4'h2, 4'h1, 4'h7, 4'h4,
+                                               4'h8, 4'hF, 4'hE, 4'h3,
+                                               4'hD, 4'hA, 4'h0, 4'h9,
+                                               4'hB, 4'h6, 4'h5, 4'hC};
+
+  parameter logic [15:0][3:0] PRESENT_SBOX4_INV = {4'hA, 4'h9, 4'h7, 4'h0,
+                                                   4'h3, 4'h6, 4'h4, 4'hB,
+                                                   4'hD, 4'h2, 4'h1, 4'hC,
+                                                   4'h8, 4'hF, 4'hE, 4'h5};
+
+  // these are modified permutation indices for a 32bit version that
+  // follow the same pattern as for the 64bit version
+  parameter logic [31:0][4:0] PRESENT_PERM32 = {5'd31, 5'd23, 5'd15, 5'd07,
+                                                5'd30, 5'd22, 5'd14, 5'd06,
+                                                5'd29, 5'd21, 5'd13, 5'd05,
+                                                5'd28, 5'd20, 5'd12, 5'd04,
+                                                5'd27, 5'd19, 5'd11, 5'd03,
+                                                5'd26, 5'd18, 5'd10, 5'd02,
+                                                5'd25, 5'd17, 5'd09, 5'd01,
+                                                5'd24, 5'd16, 5'd08, 5'd00};
+
+  parameter logic [31:0][4:0] PRESENT_PERM32_INV = {5'd31, 5'd27, 5'd23, 5'd19,
+                                                    5'd15, 5'd11, 5'd07, 5'd03,
+                                                    5'd30, 5'd26, 5'd22, 5'd18,
+                                                    5'd14, 5'd10, 5'd06, 5'd02,
+                                                    5'd29, 5'd25, 5'd21, 5'd17,
+                                                    5'd13, 5'd09, 5'd05, 5'd01,
+                                                    5'd28, 5'd24, 5'd20, 5'd16,
+                                                    5'd12, 5'd08, 5'd04, 5'd00};
+
+  // these are the permutation indices of the present cipher
+  parameter logic [63:0][5:0] PRESENT_PERM64 = {6'd63, 6'd47, 6'd31, 6'd15,
+                                                6'd62, 6'd46, 6'd30, 6'd14,
+                                                6'd61, 6'd45, 6'd29, 6'd13,
+                                                6'd60, 6'd44, 6'd28, 6'd12,
+                                                6'd59, 6'd43, 6'd27, 6'd11,
+                                                6'd58, 6'd42, 6'd26, 6'd10,
+                                                6'd57, 6'd41, 6'd25, 6'd09,
+                                                6'd56, 6'd40, 6'd24, 6'd08,
+                                                6'd55, 6'd39, 6'd23, 6'd07,
+                                                6'd54, 6'd38, 6'd22, 6'd06,
+                                                6'd53, 6'd37, 6'd21, 6'd05,
+                                                6'd52, 6'd36, 6'd20, 6'd04,
+                                                6'd51, 6'd35, 6'd19, 6'd03,
+                                                6'd50, 6'd34, 6'd18, 6'd02,
+                                                6'd49, 6'd33, 6'd17, 6'd01,
+                                                6'd48, 6'd32, 6'd16, 6'd00};
+
+  parameter logic [63:0][5:0] PRESENT_PERM64_INV = {6'd63, 6'd59, 6'd55, 6'd51,
+                                                    6'd47, 6'd43, 6'd39, 6'd35,
+                                                    6'd31, 6'd27, 6'd23, 6'd19,
+                                                    6'd15, 6'd11, 6'd07, 6'd03,
+                                                    6'd62, 6'd58, 6'd54, 6'd50,
+                                                    6'd46, 6'd42, 6'd38, 6'd34,
+                                                    6'd30, 6'd26, 6'd22, 6'd18,
+                                                    6'd14, 6'd10, 6'd06, 6'd02,
+                                                    6'd61, 6'd57, 6'd53, 6'd49,
+                                                    6'd45, 6'd41, 6'd37, 6'd33,
+                                                    6'd29, 6'd25, 6'd21, 6'd17,
+                                                    6'd13, 6'd09, 6'd05, 6'd01,
+                                                    6'd60, 6'd56, 6'd52, 6'd48,
+                                                    6'd44, 6'd40, 6'd36, 6'd32,
+                                                    6'd28, 6'd24, 6'd20, 6'd16,
+                                                    6'd12, 6'd08, 6'd04, 6'd00};
+
+  // forward key schedule
+  function automatic logic [63:0] present_update_key64(logic [63:0] key_in,
+                                                       logic [4:0]  round_idx);
+    logic [63:0] rotated;
+    // left rotation by 61 positions: bits 2:0 wrap around to the top
+    rotated = {key_in[2:0], key_in[63:3]};
+    // substitute the most significant nibble using the PRESENT sbox
+    rotated[63:60] = PRESENT_SBOX4[rotated[63:60]];
+    // mix the round index into bits 19:15
+    rotated[19:15] = rotated[19:15] ^ round_idx;
+    return rotated;
+  endfunction : present_update_key64
+
+  function automatic logic [79:0] present_update_key80(logic [79:0] key_in,
+                                                       logic [4:0]  round_idx);
+    logic [79:0] rotated;
+    // left rotation by 61 positions: bits 18:0 wrap around to the top
+    rotated = {key_in[18:0], key_in[79:19]};
+    // substitute the most significant nibble using the PRESENT sbox
+    rotated[79:76] = PRESENT_SBOX4[rotated[79:76]];
+    // mix the round index into bits 19:15
+    rotated[19:15] = rotated[19:15] ^ round_idx;
+    return rotated;
+  endfunction : present_update_key80
+
+  function automatic logic [127:0] present_update_key128(logic [127:0] key_in,
+                                                         logic [4:0]   round_idx);
+    logic [127:0] rotated;
+    // left rotation by 61 positions, then sbox on the most significant nibble
+    rotated = {key_in[66:0], key_in[127:67]};
+    rotated[127:124] = PRESENT_SBOX4[rotated[127:124]];
+    // mix the round index into bits 19:15. NOTE(review): the PRESENT-128 schedule in the
+    // CHES 2007 paper uses bits 66:62 and a second sbox on bits 123:120 - confirm intent
+    rotated[19:15] = rotated[19:15] ^ round_idx;
+    return rotated;
+  endfunction : present_update_key128
+
+
+  // inverse key schedule
+  function automatic logic [63:0] present_inv_update_key64(logic [63:0] key_in,
+                                                           logic [4:0]  round_idx,
+                                                           // total number of rounds employed
+                                                           logic [4:0]  round_cnt);
+    logic [63:0] key_out;
+    key_out = key_in; // work on a copy and undo present_update_key64 in reverse order
+    // xor out the round counter on bits 19 to 15 (forward index: round_cnt + 1 - round_idx)
+    key_out[19:15] ^= 5'(6'(round_cnt) + 1 - round_idx);
+    // inverse sbox on uppermost 4 bits, then rotate by 61 to the right
+    key_out[63 -: 4] = PRESENT_SBOX4_INV[key_out[63 -: 4]];
+    key_out = 64'(key_out >> 61) | 64'(key_out << (64-61));
+    return key_out;
+  endfunction : present_inv_update_key64
+
+  function automatic logic [79:0] present_inv_update_key80(logic [79:0] key_in,
+                                                           logic [4:0]  round_idx,
+                                                           // total number of rounds employed
+                                                           logic [4:0]  round_cnt);
+    logic [79:0] key_out;
+    key_out = key_in; // work on a copy and undo present_update_key80 in reverse order
+    // xor out the round counter on bits 19 to 15 (forward index: round_cnt + 1 - round_idx)
+    key_out[19:15] ^= 5'(6'(round_cnt) + 1 - round_idx);
+    // inverse sbox on uppermost 4 bits, then rotate by 61 to the right
+    key_out[79 -: 4] = PRESENT_SBOX4_INV[key_out[79 -: 4]];
+    key_out = 80'(key_out >> 61) | 80'(key_out << (80-61));
+    return key_out;
+  endfunction : present_inv_update_key80
+
+  function automatic logic [127:0] present_inv_update_key128(logic [127:0] key_in,
+                                                             logic [4:0]   round_idx,
+                                                             // total number of rounds employed
+                                                             logic [4:0]   round_cnt);
+    logic [127:0] key_out;
+    key_out = key_in; // work on a copy and undo present_update_key128 in reverse order
+    // xor out the round counter on bits 19 to 15 (forward index: round_cnt + 1 - round_idx)
+    key_out[19:15] ^= 5'(6'(round_cnt) + 1 - round_idx);
+    // inverse sbox on uppermost 4 bits, then rotate by 61 to the right
+    key_out[127 -: 4] = PRESENT_SBOX4_INV[key_out[127 -: 4]];
+    key_out = 128'(key_out >> 61) | 128'(key_out << (128-61));
+    return key_out;
+  endfunction : present_inv_update_key128
+
+
+  // these functions can be used to derive the DEC key from the ENC key by
+  // stepping the key by the correct number of rounds using the keyschedule functions above.
+  function automatic logic [63:0] present_get_dec_key64(logic [63:0] key_in,
+                                                        // total number of rounds employed
+                                                        logic [4:0]  round_cnt);
+    logic [63:0] stepped_key;
+    stepped_key = key_in; // advanced below by round_cnt forward key updates
+    for (int unsigned idx = 1; idx <= round_cnt; idx++) begin
+      stepped_key = present_update_key64(stepped_key, 5'(idx));
+    end
+    return stepped_key;
+  endfunction : present_get_dec_key64
+
+  function automatic logic [79:0] present_get_dec_key80(logic [79:0] key_in,
+                                                        // total number of rounds employed
+                                                        logic [4:0]  round_cnt);
+    logic [79:0] stepped_key;
+    stepped_key = key_in; // advanced below by round_cnt forward key updates
+    for (int unsigned idx = 1; idx <= round_cnt; idx++) begin
+      stepped_key = present_update_key80(stepped_key, 5'(idx));
+    end
+    return stepped_key;
+  endfunction : present_get_dec_key80
+
+  function automatic logic [127:0] present_get_dec_key128(logic [127:0] key_in,
+                                                          // total number of rounds employed
+                                                          logic [4:0]   round_cnt);
+    logic [127:0] stepped_key;
+    stepped_key = key_in; // advanced below by round_cnt forward key updates
+    for (int unsigned idx = 1; idx <= round_cnt; idx++) begin
+      stepped_key = present_update_key128(stepped_key, 5'(idx));
+    end
+    return stepped_key;
+  endfunction : present_get_dec_key128
+
+  /////////////////////////
+  // Common Subfunctions //
+  /////////////////////////
+
+  function automatic logic [31:0] sbox4_32bit(logic [31:0] state_in, logic [15:0][3:0] sbox4);
+    logic [31:0] substituted;
+    // substitute all 8 nibbles of the state through the supplied 4bit LUT
+    for (int lsb = 0; lsb < 32; lsb += 4) begin
+      substituted[lsb +: 4] = sbox4[state_in[lsb +: 4]];
+    end
+    return substituted;
+  endfunction : sbox4_32bit
+
+  function automatic logic [63:0] sbox4_64bit(logic [63:0] state_in, logic [15:0][3:0] sbox4);
+    logic [63:0] substituted;
+    // substitute all 16 nibbles of the state through the supplied 4bit LUT
+    for (int lsb = 0; lsb < 64; lsb += 4) begin
+      substituted[lsb +: 4] = sbox4[state_in[lsb +: 4]];
+    end
+    return substituted;
+  endfunction : sbox4_64bit
+
+  function automatic logic [31:0] perm_32bit(logic [31:0] state_in, logic [31:0][4:0] perm);
+    logic [31:0] state_out;
+    // input bit k moves to position perm[k] (pLayer convention of the PRESENT paper)
+    for (int k = 0; k < 32; k++) begin
+      state_out[perm[k]] = state_in[k]; // perm must be a permutation to drive all bits
+    end
+    return state_out;
+  endfunction : perm_32bit
+
+  function automatic logic [63:0] perm_64bit(logic [63:0] state_in, logic [63:0][5:0] perm);
+    logic [63:0] state_out;
+    // input bit k moves to position perm[k] (pLayer convention of the PRESENT paper)
+    for (int k = 0; k < 64; k++) begin
+      state_out[perm[k]] = state_in[k]; // perm must be a permutation to drive all bits
+    end
+    return state_out;
+  endfunction : perm_64bit
+
+endpackage : prim_cipher_pkg
diff --git a/hw/ip/prim/rtl/prim_gate_gen.sv b/hw/ip/prim/rtl/prim_gate_gen.sv
index 0e369d9..e97c7d5 100644
--- a/hw/ip/prim/rtl/prim_gate_gen.sv
+++ b/hw/ip/prim/rtl/prim_gate_gen.sv
@@ -9,9 +9,10 @@
 // them with registers, resulting in a split of around 50/50 between logic and
 // sequential cells.
 //
-// This generator has been tested with 32bit wide data, and it is accurate to
-// within around 250 GE. Do not use for fever than 500 GE.
+// This generator has been tested with 32bit wide data, and produces
+// the following results:
 //
+// if valid_i constantly set to 1'b1:
 // -------------+-----------+----------
 // requested GE | actual GE | GE error
 // -------------+-----------+----------
@@ -26,6 +27,24 @@
 // 25000        |   25228   |   228
 // 50000        |   50485   |   485
 //
+// otherwise, with clock gating enabled:
+// -------------+-----------+----------
+// requested GE | actual GE | GE error
+// -------------+-----------+----------
+// 500          |   696     |   196
+// 1000         |   1043    |   43
+// 1500         |   1737    |   237
+// 2500         |   2779    |   279
+// 5000         |   5340    |   340
+// 7500         |   7634    |   134
+// 10000        |   10284   |   284
+// 15000        |   15585   |   585
+// 25000        |   25855   |   855
+// 50000        |   51732   |   1732
+//
+// Note that the generator is not very accurate for smaller gate counts due
+// to the generate loop granularity. Hence, do not use for fewer than 500 GE.
+
 
 module prim_gate_gen #(
   parameter int DataWidth = 32,
@@ -34,8 +53,10 @@
   input                        clk_i,
   input                        rst_ni,
 
+  input                        valid_i,
   input        [DataWidth-1:0] data_i,
-  output logic [DataWidth-1:0] data_o
+  output logic [DataWidth-1:0] data_o,
+  output                       valid_o
 );
 
   /////////////////////////////////////
@@ -54,43 +75,29 @@
   `ASSERT(DataMustBeMultipleOfFour_A, DataWidth % 4 == 0)
 
   /////////////////////
-  // Helper Function //
-  /////////////////////
-
-  // this is the sbox from the prince cipher
-  localparam logic[15:0][3:0] SBox4 = {4'h4, 4'hD, 4'h5, 4'hE,
-                                       4'h0, 4'h8, 4'h7, 4'h6,
-                                       4'h1, 4'h9, 4'hC, 4'hA,
-                                       4'h2, 4'h3, 4'hF, 4'hB};
-
-  function automatic logic [DataWidth-1:0] sbox4_layer(logic [DataWidth-1:0] state_in);
-    logic [DataWidth-1:0] state_out;
-    // note that if simulation performance becomes an issue, this loop can be unrolled
-    for (int k = 0; k < DataWidth/4; k++) begin
-      state_out[k*4  +: 4] = SBox4[state_in[k*4  +: 4]];
-    end
-    return state_out;
-  endfunction : sbox4_layer
-
-  /////////////////////
   // Generator Loops //
   /////////////////////
 
-  (* preserve *) logic [NumOuterRounds-1:0][DataWidth-1:0] regs_d, regs_q;
+  logic [NumOuterRounds-1:0][DataWidth-1:0] regs_d, regs_q;
+  logic [NumOuterRounds-1:0] valid_d, valid_q;
 
   for (genvar k = 0; k < NumOuterRounds; k++) begin : gen_outer_round
 
-    (* preserve *) logic [NumInnerRounds:0][DataWidth-1:0] inner_data;
+    logic [NumInnerRounds:0][DataWidth-1:0] inner_data;
 
     if (k==0) begin : gen_first
       assign inner_data[0] = data_i;
+      assign valid_d[0]    = valid_i;
     end else begin : gen_others
       assign inner_data[0] = regs_q[k-1];
+      assign valid_d[k]    = valid_q[k-1];
     end
 
     for (genvar l = 0; l < NumInnerRounds; l++) begin : gen_inner
       // 2bit rotation + sbox layer
-      assign inner_data[l+1] = sbox4_layer({inner_data[l][1:0], inner_data[l][DataWidth-1:2]});
+      assign inner_data[l+1] = prim_cipher_pkg::sbox4_32bit({inner_data[l][1:0],
+                                                             inner_data[l][DataWidth-1:2]},
+                                                             prim_cipher_pkg::PRINCE_SBOX4);
     end
 
     assign regs_d[k] = inner_data[NumInnerRounds];
@@ -99,11 +106,18 @@
   always_ff @(posedge clk_i or negedge rst_ni) begin : p_regs
     if (!rst_ni) begin
       regs_q <= '0;
+      valid_q <= '0;
     end else begin
-      regs_q <= regs_d;
+      valid_q <= valid_d;
+      for (int k = 0; k < NumOuterRounds; k++) begin
+        if (valid_d[k]) begin
+          regs_q[k] <= regs_d[k];
+        end
+      end
     end
   end
 
   assign data_o = regs_q[NumOuterRounds-1];
+  assign valid_o = valid_q[NumOuterRounds-1];
 
 endmodule : prim_gate_gen
diff --git a/hw/ip/prim/rtl/prim_present.sv b/hw/ip/prim/rtl/prim_present.sv
index 83aa14e..bf3321c 100644
--- a/hw/ip/prim/rtl/prim_present.sv
+++ b/hw/ip/prim/rtl/prim_present.sv
@@ -12,7 +12,7 @@
 // 32bit variant is only intended to be used as a lightweight data scrambling
 // device.
 //
-// See also: prim_prince
+// See also: prim_prince, prim_cipher_pkg
 //
 // References: - https://en.wikipedia.org/wiki/PRESENT
 //             - https://en.wikipedia.org/wiki/Prince_(cipher)
@@ -24,117 +24,109 @@
 // synthesis experiments.
 
 module prim_present #(
-  parameter int DataWidth = 64, // {32, 64}
-  parameter int KeyWidth  = 80, // {64, 80, 128}
-  parameter int NumRounds = 31  // > 0
+  parameter int DataWidth = 64,  // {32, 64}
+  parameter int KeyWidth  = 128, // {64, 80, 128}
+  parameter int NumRounds = 31,  // > 0
+  // Note that the decryption pass needs a modified key,
+  // to be calculated by performing NumRounds key updates
+  parameter bit Decrypt   = 0    // 0: encrypt, 1: decrypt
 ) (
   input        [DataWidth-1:0] data_i,
   input        [KeyWidth-1:0]  key_i,
-  output logic [DataWidth-1:0] data_o
+  output logic [DataWidth-1:0] data_o,
+  output logic [KeyWidth-1:0]  key_o
 );
 
-  //////////////////////////////////
-  // helper functions / constants //
-  //////////////////////////////////
-
-  // this is the sbox from the present cipher
-  localparam logic[15:0][3:0] SBox4 = {4'h2, 4'h1, 4'h7, 4'h4,
-                                       4'h8, 4'hF, 4'hE, 4'h3,
-                                       4'hD, 4'hA, 4'h0, 4'h9,
-                                       4'hB, 4'h6, 4'h5, 4'hC};
-
-  // these are modified permutation indices for a 32bit version that
-  // follow the same pattern as for the 64bit version
-  localparam logic[31:0][4:0] Perm32 = {5'd31, 5'd23, 5'd15, 5'd7,
-                                        5'd30, 5'd22, 5'd14, 5'd6,
-                                        5'd29, 5'd21, 5'd13, 5'd5,
-                                        5'd28, 5'd20, 5'd12, 5'd4,
-                                        5'd27, 5'd19, 5'd11, 5'd3,
-                                        5'd26, 5'd18, 5'd10, 5'd2,
-                                        5'd25, 5'd17, 5'd9,  5'd1,
-                                        5'd24, 5'd16, 5'd8,  5'd0};
-
-  // these are the permutation indices of the present cipher
-  localparam logic[63:0][5:0] Perm64 = {6'd63, 6'd47, 6'd31, 6'd15,
-                                        6'd62, 6'd46, 6'd30, 6'd14,
-                                        6'd61, 6'd45, 6'd29, 6'd13,
-                                        6'd60, 6'd44, 6'd28, 6'd12,
-                                        6'd59, 6'd43, 6'd27, 6'd11,
-                                        6'd58, 6'd42, 6'd26, 6'd10,
-                                        6'd57, 6'd41, 6'd25, 6'd09,
-                                        6'd56, 6'd40, 6'd24, 6'd08,
-                                        6'd55, 6'd39, 6'd23, 6'd07,
-                                        6'd54, 6'd38, 6'd22, 6'd06,
-                                        6'd53, 6'd37, 6'd21, 6'd05,
-                                        6'd52, 6'd36, 6'd20, 6'd04,
-                                        6'd51, 6'd35, 6'd19, 6'd03,
-                                        6'd50, 6'd34, 6'd18, 6'd02,
-                                        6'd49, 6'd33, 6'd17, 6'd01,
-                                        6'd48, 6'd32, 6'd16, 6'd00};
-
-  function automatic logic [DataWidth-1:0] sbox4_layer(logic [DataWidth-1:0] state_in);
-    logic [63:0] state_out;
-    // note that if simulation performance becomes an issue, this loop can be unrolled
-    for (int k = 0; k < DataWidth/4; k++) begin
-      state_out[k*4  +: 4] = SBox4[state_in[k*4  +: 4]];
-    end
-    return state_out;
-  endfunction : sbox4_layer
-
-  function automatic logic [DataWidth-1:0] perm_layer(logic [DataWidth-1:0] state_in);
-    logic [DataWidth-1:0] state_out;
-    if (DataWidth == 64) begin
-      // note that if simulation performance becomes an issue, this loop can be unrolled
-      for (int k = 0; k < DataWidth; k++) begin
-        state_out[k] = state_in[Perm64[k]];
-      end
-    end else begin
-      // note that if simulation performance becomes an issue, this loop can be unrolled
-      for (int k = 0; k < DataWidth; k++) begin
-        state_out[k] = state_in[Perm32[k]];
-      end
-    end
-    return state_out;
-  endfunction : perm_layer
-
-  function automatic logic [KeyWidth-1:0] update_key(logic [KeyWidth-1:0] key_in,
-                                                     logic [4:0] round_cnt);
-    logic [KeyWidth-1:0] key_out;
-    // rotate by 61 to the left
-    key_out = KeyWidth'(key_in << 61) | KeyWidth'(key_in >> (KeyWidth-61));
-    // sbox on uppermost 4 bits
-    key_out[KeyWidth-1 -: 4] = SBox4[key_out[KeyWidth-1 -: 4]];
-    // xor in round counter on bits 19 to 15
-    key_out[19:15] ^= round_cnt;
-    return key_out;
-  endfunction : update_key
-
   //////////////
   // datapath //
   //////////////
 
-  logic [DataWidth-1:0] data_state;
-  logic [KeyWidth-1:0]  round_key;
-  always_comb begin : p_present
-    data_state = data_i;
-    round_key  = key_i;
-    for (int k = 0; k < NumRounds; k++) begin
-      // cipher layers
-      data_state = data_state ^ round_key[KeyWidth-1 : KeyWidth-DataWidth];
-      data_state = sbox4_layer(data_state);
-      data_state = perm_layer(data_state);
+  logic [NumRounds:0][DataWidth-1:0] data_state;
+  logic [NumRounds:0][KeyWidth-1:0]  round_key;
+
+  // initialize
+  assign data_state[0] = data_i;
+  assign round_key[0]  = key_i;
+
+  for (genvar k = 0; k < NumRounds; k++) begin : gen_round
+    logic [DataWidth-1:0] data_state_xor, data_state_sbox;
+    // cipher layers: add the upper DataWidth bits of this round's key to the state
+    assign data_state_xor  = data_state[k] ^ round_key[k][KeyWidth-1 : KeyWidth-DataWidth];
+
+    ////////////////////////////////
+    // decryption pass: inverse permutation, then inverse sbox, and inverse keyschedule
+    if (Decrypt) begin : gen_dec
+      // original 64bit variant (data_state_sbox holds the inverse-permuted state here)
+      if (DataWidth == 64) begin : gen_d64
+        assign data_state_sbox = prim_cipher_pkg::perm_64bit(data_state_xor,
+                                                             prim_cipher_pkg::PRESENT_PERM64_INV);
+        assign data_state[k+1] = prim_cipher_pkg::sbox4_64bit(data_state_sbox,
+                                                              prim_cipher_pkg::PRESENT_SBOX4_INV);
+      // reduced 32bit variant
+      end else begin : gen_d32
+        assign data_state_sbox = prim_cipher_pkg::perm_32bit(data_state_xor,
+                                                             prim_cipher_pkg::PRESENT_PERM32_INV);
+        assign data_state[k+1] = prim_cipher_pkg::sbox4_32bit(data_state_sbox,
+                                                              prim_cipher_pkg::PRESENT_SBOX4_INV);
+      end
       // update round key, count goes from 1 to 31 (max)
-      round_key  = update_key(round_key, 5'(k + 1));
-    end
-    data_o = data_state ^ round_key;
-  end
+      // original 128bit key variant
+      if (KeyWidth == 128) begin : gen_k128
+        assign round_key[k+1]  = prim_cipher_pkg::present_inv_update_key128(round_key[k],
+                                                                            5'(k + 1),
+                                                                            5'(NumRounds));
+      // original 80bit key variant
+      end else if (KeyWidth == 80) begin : gen_k80
+        assign round_key[k+1]  = prim_cipher_pkg::present_inv_update_key80(round_key[k],
+                                                                           5'(k + 1),
+                                                                           5'(NumRounds));
+      // reduced 64bit key variant
+      end else begin : gen_k64
+        assign round_key[k+1]  = prim_cipher_pkg::present_inv_update_key64(round_key[k],
+                                                                           5'(k + 1),
+                                                                           5'(NumRounds));
+      end
+    ////////////////////////////////
+    // encryption pass: sbox, then permutation, and forward keyschedule
+    end else begin : gen_enc
+      // original 64bit variant
+      if (DataWidth == 64) begin : gen_d64
+        assign data_state_sbox = prim_cipher_pkg::sbox4_64bit(data_state_xor,
+                                                              prim_cipher_pkg::PRESENT_SBOX4);
+        assign data_state[k+1] = prim_cipher_pkg::perm_64bit(data_state_sbox,
+                                                             prim_cipher_pkg::PRESENT_PERM64);
+      // reduced 32bit variant
+      end else begin : gen_d32
+        assign data_state_sbox = prim_cipher_pkg::sbox4_32bit(data_state_xor,
+                                                              prim_cipher_pkg::PRESENT_SBOX4);
+        assign data_state[k+1] = prim_cipher_pkg::perm_32bit(data_state_sbox,
+                                                             prim_cipher_pkg::PRESENT_PERM32);
+      end
+      // update round key, count goes from 1 to 31 (max)
+      // original 128bit key variant
+      if (KeyWidth == 128) begin : gen_k128
+        assign round_key[k+1]  = prim_cipher_pkg::present_update_key128(round_key[k], 5'(k + 1));
+      // original 80bit key variant
+      end else if (KeyWidth == 80) begin : gen_k80
+        assign round_key[k+1]  = prim_cipher_pkg::present_update_key80(round_key[k], 5'(k + 1));
+      // reduced 64bit key variant
+      end else begin : gen_k64
+        assign round_key[k+1]  = prim_cipher_pkg::present_update_key64(round_key[k], 5'(k + 1));
+      end
+    end // gen_enc
+    ////////////////////////////////
+  end // gen_round
+
+  // finalize
+  assign data_o = data_state[NumRounds] ^ round_key[NumRounds][KeyWidth-1 : KeyWidth-DataWidth];
+  assign key_o  = round_key[NumRounds];
 
   ////////////////
   // assertions //
   ////////////////
 
-  `ASSERT_INIT(SupportedDataWidth_A, DataWidth inside {32, 64})
-  `ASSERT_INIT(SupportedKeyWidth_A, KeyWidth inside {64, 80, 128})
-  `ASSERT_INIT(SupportedNumRounds_A, NumRounds > 0)
+  `ASSERT_INIT(SupportedWidths_A, (DataWidth == 64 && KeyWidth inside {80, 128}) ||
+                                  (DataWidth == 32 && KeyWidth == 64))
+  `ASSERT_INIT(SupportedNumRounds_A, NumRounds > 0 && NumRounds <= 31)
 
 endmodule : prim_present
diff --git a/hw/ip/prim/rtl/prim_prince.sv b/hw/ip/prim/rtl/prim_prince.sv
index 355c916..bd5e50d 100644
--- a/hw/ip/prim/rtl/prim_prince.sv
+++ b/hw/ip/prim/rtl/prim_prince.sv
@@ -12,7 +12,7 @@
 // strength is required. The 32bit variant is only intended to be used as a
 // lightweight data scrambling device.
 //
-// See also: prim_present
+// See also: prim_present, prim_cipher_pkg
 //
 // References: - https://en.wikipedia.org/wiki/PRESENT
 //             - https://en.wikipedia.org/wiki/Prince_(cipher)
@@ -41,150 +41,13 @@
   output logic [DataWidth-1:0] data_o
 );
 
-  //////////////////////////////////
-  // helper functions / constants //
-  //////////////////////////////////
+  ///////////////////
+  // key expansion //
+  ///////////////////
 
-  // this is the sbox from the prince cipher
-  localparam logic[15:0][3:0] SBox4 = {4'h4, 4'hD, 4'h5, 4'hE,
-                                       4'h0, 4'h8, 4'h7, 4'h6,
-                                       4'h1, 4'h9, 4'hC, 4'hA,
-                                       4'h2, 4'h3, 4'hF, 4'hB};
-
-  localparam logic[15:0][3:0] SBox4Inv = {4'h1, 4'hC, 4'hE, 4'h5,
-                                          4'h0, 4'h4, 4'h6, 4'hA,
-                                          4'h9, 4'h8, 4'hD, 4'hF,
-                                          4'h2, 4'h3, 4'h7, 4'hB};
-  // nibble permutations
-  localparam logic [15:0][3:0] Shiftrows64  = '{4'hB, 4'h6, 4'h1, 4'hC,
-                                                4'h7, 4'h2, 4'hD, 4'h8,
-                                                4'h3, 4'hE, 4'h9, 4'h4,
-                                                4'hF, 4'hA, 4'h5, 4'h0};
-
-  localparam logic [15:0][3:0] Shiftrows64Inv = '{4'h3, 4'h6, 4'h9, 4'hC,
-                                                  4'hF, 4'h2, 4'h5, 4'h8,
-                                                  4'hB, 4'hE, 4'h1, 4'h4,
-                                                  4'h7, 4'hA, 4'hD, 4'h0};
-
-  // these are the round constants
-  localparam logic[11:0][63:0] RoundConst = {64'hC0AC29B7C97C50DD,
-                                             64'hD3B5A399CA0C2399,
-                                             64'h64A51195E0E3610D,
-                                             64'hC882D32F25323C54,
-                                             64'h85840851F1AC43AA,
-                                             64'h7EF84F78FD955CB1,
-                                             64'hBE5466CF34E90C6C,
-                                             64'h452821E638D01377,
-                                             64'h082EFA98EC4E6C89,
-                                             64'hA4093822299F31D0,
-                                             64'h13198A2E03707344,
-                                             64'h0000000000000000};
-
-  // tweak constant for key modification between enc/dec modes
-  localparam logic [63:0] AlphaConst = 64'hC0AC29B7C97C50DD;
-
-  function automatic logic [DataWidth-1:0] sbox4_layer(logic [DataWidth-1:0] state_in);
-    logic [DataWidth-1:0] state_out;
-    // note that if simulation performance becomes an issue, this loop can be unrolled
-    for (int k = 0; k < DataWidth/4; k++) begin
-      state_out[k*4  +: 4] = SBox4[state_in[k*4  +: 4]];
-    end
-    return state_out;
-  endfunction : sbox4_layer
-
-  function automatic logic [DataWidth-1:0] sbox4_inv_layer(logic [DataWidth-1:0] state_in);
-    logic [DataWidth-1:0] state_out;
-    // note that if simulation performance becomes an issue, this loop can be unrolled
-    for (int k = 0; k < DataWidth/4; k++) begin
-      state_out[k*4  +: 4] = SBox4Inv[state_in[k*4  +: 4]];
-    end
-    return state_out;
-  endfunction : sbox4_inv_layer
-
-  // nibble shifts
-  function automatic logic [DataWidth-1:0] shiftrows_layer(logic [DataWidth-1:0] state_in);
-    logic [DataWidth-1:0] state_out;
-    if (DataWidth == 64) begin
-      // note that if simulation performance becomes an issue, this loop can be unrolled
-      for (int k = 0; k < DataWidth/4; k++) begin
-        state_out[k*4  +: 4] = state_in[Shiftrows64[k]*4  +: 4];
-      end
-    end else begin
-      // note that if simulation performance becomes an issue, this loop can be unrolled
-      for (int k = 0; k < DataWidth/2; k++) begin
-        // operate on pairs of 2bit instead of nibbles
-        state_out[k*2  +: 2] = state_in[Shiftrows64[k]*2  +: 2];
-      end
-    end
-    return state_out;
-  endfunction : shiftrows_layer
-
-  function automatic logic [DataWidth-1:0] shiftrows_inv_layer(logic [DataWidth-1:0] state_in);
-    logic [DataWidth-1:0] state_out;
-    if (DataWidth == 64) begin
-      // note that if simulation performance becomes an issue, this loop can be unrolled
-      for (int k = 0; k < DataWidth/4; k++) begin
-        state_out[k*4  +: 4] = state_in[Shiftrows64Inv[k]*4  +: 4];
-      end
-    end else begin
-      // note that if simulation performance becomes an issue, this loop can be unrolled
-      for (int k = 0; k < DataWidth/2; k++) begin
-        // operate on pairs of 2bit instead of nibbles
-        state_out[k*2  +: 2] = state_in[Shiftrows64Inv[k]*2  +: 2];
-      end
-    end
-    return state_out;
-  endfunction : shiftrows_inv_layer
-
-  // XOR reduction of four nibbles in a 16bit subvector
-  function automatic logic [3:0] nibble_red16(logic [15:0] vect);
-    return vect[0 +: 4] ^ vect[4 +: 4] ^ vect[8 +: 4] ^ vect[12 +: 4];
-  endfunction : nibble_red16
-
-  // M prime multiplication
-  function automatic logic [DataWidth-1:0] mult_prime_layer(logic [DataWidth-1:0] state_in);
-    logic [DataWidth-1:0] state_out;
-    // M0
-    state_out[0  +: 4] = nibble_red16(state_in[ 0 +: 16] & 16'hEDB7);
-    state_out[4  +: 4] = nibble_red16(state_in[ 0 +: 16] & 16'h7EDB);
-    state_out[8  +: 4] = nibble_red16(state_in[ 0 +: 16] & 16'hB7ED);
-    state_out[12 +: 4] = nibble_red16(state_in[ 0 +: 16] & 16'hDB7E);
-    // M1
-    state_out[16 +: 4] = nibble_red16(state_in[16 +: 16] & 16'h7EDB);
-    state_out[20 +: 4] = nibble_red16(state_in[16 +: 16] & 16'hB7ED);
-    state_out[24 +: 4] = nibble_red16(state_in[16 +: 16] & 16'hDB7E);
-    state_out[28 +: 4] = nibble_red16(state_in[16 +: 16] & 16'hEDB7);
-    if (DataWidth == 64) begin
-      // M1
-      state_out[32 +: 4] = nibble_red16(state_in[32 +: 16] & 16'h7EDB);
-      state_out[36 +: 4] = nibble_red16(state_in[32 +: 16] & 16'hB7ED);
-      state_out[40 +: 4] = nibble_red16(state_in[32 +: 16] & 16'hDB7E);
-      state_out[44 +: 4] = nibble_red16(state_in[32 +: 16] & 16'hEDB7);
-      // M0
-      state_out[48 +: 4] = nibble_red16(state_in[48 +: 16] & 16'hEDB7);
-      state_out[52 +: 4] = nibble_red16(state_in[48 +: 16] & 16'h7EDB);
-      state_out[56 +: 4] = nibble_red16(state_in[48 +: 16] & 16'hB7ED);
-      state_out[60 +: 4] = nibble_red16(state_in[48 +: 16] & 16'hDB7E);
-    end
-    return state_out;
-  endfunction : mult_prime_layer
-
-  //////////////
-  // datapath //
-  //////////////
-
-  logic [DataWidth-1:0] data_state;
   logic [DataWidth-1:0] k0, k0_prime, k1, k0_new;
 
-  if (UseOldKeySched) begin : gen_legacy_keyschedule
-    assign k0_new = k1;
-  end else begin : gen_new_keyschedule
-    // improved keyschedule proposed by https://eprint.iacr.org/2014/656.pdf
-    assign k0_new = k0;
-  end
-
-  always_comb begin : p_prince
-    // key expansion
+  always_comb begin : p_key_expansion
     k0       = key_i[DataWidth-1:0];
     k0_prime = {k0[0], k0[DataWidth-1:2], k0[DataWidth-1] ^ k0[1]};
     k1       = key_i[2*DataWidth-1 : DataWidth];
@@ -193,52 +56,125 @@
     if (dec_i) begin
       k0       = k0_prime;
       k0_prime = key_i[DataWidth-1:0];
-      k1       ^= AlphaConst[DataWidth-1:0];
+      k1       ^= prim_cipher_pkg::PRINCE_ALPHA_CONST[DataWidth-1:0];
     end
+  end
 
-    // pre-rounds
-    data_state = data_i ^ k0;
-    data_state ^= k1;
-    data_state ^= RoundConst[0][DataWidth-1:0];
+  if (UseOldKeySched) begin : gen_legacy_keyschedule
+    assign k0_new = k1;
+  end else begin : gen_new_keyschedule
+    // improved keyschedule (https://eprint.iacr.org/2014/656.pdf): k0_new follows k0, not k1
+    assign k0_new = k0;
+  end
 
-    // forward pass
-    for (int k = 1; k <= NumRoundsHalf; k++) begin
-      data_state = sbox4_layer(data_state);
-      data_state = mult_prime_layer(data_state);
-      data_state = shiftrows_layer(data_state);
-      data_state ^= RoundConst[k][DataWidth-1:0];
-      // improved keyschedule proposed by https://eprint.iacr.org/2014/656.pdf
-      data_state ^= (1'(k) & 1'b1) ? k0_new : k1;
+  //////////////
+  // datapath //
+  //////////////
+
+  // state variable for holding the rounds
+  logic [NumRoundsHalf*2+1:0][DataWidth-1:0] data_state;
+
+  // pre-round XOR: combine the input with k0, k1 and round constant 0 in one expression
+  always_comb begin : p_pre_round_xor
+    data_state[0] = data_i ^
+                    k0 ^ k1 ^
+                    prim_cipher_pkg::PRINCE_ROUND_CONST[0][DataWidth-1:0];
+  end
+
+  // forward pass: rounds 1..NumRoundsHalf, each sbox -> mult_prime -> shiftrows -> const -> key
+  for (genvar k = 1; k <= NumRoundsHalf; k++) begin : gen_fwd_pass
+    logic [DataWidth-1:0] data_state_round;
+    if (DataWidth == 64) begin : gen_fwd_d64
+      always_comb begin : p_fwd_d64
+        data_state_round = prim_cipher_pkg::sbox4_64bit(data_state[k-1],
+            prim_cipher_pkg::PRINCE_SBOX4);
+        data_state_round = prim_cipher_pkg::prince_mult_prime_64bit(data_state_round);
+        data_state_round = prim_cipher_pkg::prince_shiftrows_64bit(data_state_round,
+            prim_cipher_pkg::PRINCE_SHIFT_ROWS64);
+      end
+    end else begin : gen_fwd_d32
+      always_comb begin : p_fwd_d32
+        data_state_round = prim_cipher_pkg::sbox4_32bit(data_state[k-1],
+            prim_cipher_pkg::PRINCE_SBOX4);
+        data_state_round = prim_cipher_pkg::prince_mult_prime_32bit(data_state_round);
+        data_state_round = prim_cipher_pkg::prince_shiftrows_32bit(data_state_round,
+            prim_cipher_pkg::PRINCE_SHIFT_ROWS64);
+      end
     end
+    logic [DataWidth-1:0] data_state_xor;
+    assign data_state_xor = data_state_round ^
+                            prim_cipher_pkg::PRINCE_ROUND_CONST[k][DataWidth-1:0];
+    // improved keyschedule (https://eprint.iacr.org/2014/656.pdf): odd rounds use k0_new, even k1
+    if (k % 2 == 1) assign data_state[k]  = data_state_xor ^ k0_new;
+    else            assign data_state[k]  = data_state_xor ^ k1;
+  end
 
-    // middle part
-    data_state = sbox4_layer(data_state);
-    data_state = mult_prime_layer(data_state);
-    data_state = sbox4_inv_layer(data_state);
+  // middle part: sbox -> mult_prime -> inverse sbox, applied to the last forward-pass state
+  logic [DataWidth-1:0] data_state_middle;
+  if (DataWidth == 64) begin : gen_middle_d64
+    always_comb begin : p_middle_d64
+      data_state_middle = prim_cipher_pkg::sbox4_64bit(data_state[NumRoundsHalf],
+          prim_cipher_pkg::PRINCE_SBOX4);
+      data_state_middle = prim_cipher_pkg::prince_mult_prime_64bit(data_state_middle);
+      data_state_middle = prim_cipher_pkg::sbox4_64bit(data_state_middle,
+          prim_cipher_pkg::PRINCE_SBOX4_INV);
+    end
+  end else begin : gen_middle_d32
+    always_comb begin : p_middle_d32
+      data_state_middle = prim_cipher_pkg::sbox4_32bit(data_state[NumRoundsHalf],
+          prim_cipher_pkg::PRINCE_SBOX4);
+      data_state_middle = prim_cipher_pkg::prince_mult_prime_32bit(data_state_middle);
+      data_state_middle = prim_cipher_pkg::sbox4_32bit(data_state_middle,
+          prim_cipher_pkg::PRINCE_SBOX4_INV);
+    end
+  end
 
-    // reverse pass
+  assign data_state[NumRoundsHalf+1] = data_state_middle;
+
+  // backward pass: key add, round constant, then inverse shiftrows -> mult_prime -> inverse sbox
+  for (genvar k = 1; k <= NumRoundsHalf; k++) begin : gen_bwd_pass
+    logic [DataWidth-1:0] data_state_xor0, data_state_xor1;
+    // improved keyschedule (https://eprint.iacr.org/2014/656.pdf), mirrored from the forward pass
+    if ((NumRoundsHalf + k) % 2 == 0) assign data_state_xor0 = data_state[NumRoundsHalf+k] ^ k0_new;
+    else                              assign data_state_xor0 = data_state[NumRoundsHalf+k] ^ k1;
     // the construction is reflective, hence the subtraction with NumRoundsHalf
-    for (int k = 11-NumRoundsHalf; k <= 10; k++) begin
-      // improved keyschedule proposed by https://eprint.iacr.org/2014/656.pdf
-      data_state ^= (1'(k) & 1'b1) ? k1 : k0_new;
-      data_state ^= RoundConst[k][DataWidth-1:0];
-      data_state = shiftrows_inv_layer(data_state);
-      data_state = mult_prime_layer(data_state);
-      data_state = sbox4_inv_layer(data_state);
-    end
+    assign data_state_xor1 = data_state_xor0 ^
+                             prim_cipher_pkg::PRINCE_ROUND_CONST[10-NumRoundsHalf+k][DataWidth-1:0];
 
-    // post-rounds
-    data_state ^= RoundConst[11][DataWidth-1:0];
-    data_state ^= k1;
-    data_o     = data_state ^ k0_prime;
+    logic [DataWidth-1:0] data_state_bwd;
+    if (DataWidth == 64) begin : gen_bwd_d64
+      always_comb begin : p_bwd_d64
+        data_state_bwd = prim_cipher_pkg::prince_shiftrows_64bit(data_state_xor1,
+            prim_cipher_pkg::PRINCE_SHIFT_ROWS64_INV);
+        data_state_bwd = prim_cipher_pkg::prince_mult_prime_64bit(data_state_bwd);
+        data_state[NumRoundsHalf+k+1] = prim_cipher_pkg::sbox4_64bit(data_state_bwd,
+            prim_cipher_pkg::PRINCE_SBOX4_INV);
+      end
+    end else begin : gen_bwd_d32
+      always_comb begin : p_bwd_d32
+        data_state_bwd = prim_cipher_pkg::prince_shiftrows_32bit(data_state_xor1,
+            prim_cipher_pkg::PRINCE_SHIFT_ROWS64_INV);
+        data_state_bwd = prim_cipher_pkg::prince_mult_prime_32bit(data_state_bwd);
+        data_state[NumRoundsHalf+k+1] = prim_cipher_pkg::sbox4_32bit(data_state_bwd,
+            prim_cipher_pkg::PRINCE_SBOX4_INV);
+      end
+    end
+  end
+
+  // post-rounds: add round constant 11, k1 and k0_prime to the last backward-pass state
+  always_comb begin : p_post_round_xor
+    data_o = data_state[2*NumRoundsHalf+1] ^
+             prim_cipher_pkg::PRINCE_ROUND_CONST[11][DataWidth-1:0] ^
+             k1 ^
+             k0_prime;
   end
 
   ////////////////
   // assertions //
   ////////////////
 
-  `ASSERT_INIT(SupportedWidths_A, DataWidth == 64 && KeyWidth == 128 ||
-                                  DataWidth == 32 && KeyWidth == 64)
+  `ASSERT_INIT(SupportedWidths_A, (DataWidth == 64 && KeyWidth == 128) ||
+                                  (DataWidth == 32 && KeyWidth == 64))
   `ASSERT_INIT(SupportedNumRounds_A, NumRoundsHalf > 0 && NumRoundsHalf < 6)