hw/ip/aes/rtl/aes_sbox_dom.sv - 3p/lowrisc/opentitan - Git at Google

 // Copyright lowRISC contributors.
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 //
 // AES S-Box with First-Order Domain-Oriented Masking
 //
 // This is the unpipelined version using DOM-dep multipliers. It has a latency of 5 clock cycles
 // and requires 28 bits of pseudo-random data per evaluation. Pipelining would only be beneficial
 // when using
 // - either a cipher core architecture with a data path smaller than 128 bit, i.e., where the
 //   individual S-Boxes are evaluated more than once per round, or
 // - a fully unrolled cipher core architecture for maximum throughput.
 //
 // Note: The DOM AES S-Box is built on top of the Canright masked S-Box without mask re-use.
 //
 // For details, see the following papers and reports:
 // [1] Gross, "Domain-Oriented Masking: Compact Masked Hardware Implementations with Arbitrary
 //     Protection Order" available at https://eprint.iacr.org/2016/486.pdf
 // [2] Canright, "A very compact 'perfectly masked' S-box for AES (corrected)" available at
 //     https://eprint.iacr.org/2009/011.pdf
 // [3] Canright, "A very compact Rijndael S-box" available at https://hdl.handle.net/10945/25608
 //
 // Using the Coco-Alma tool in transient mode, this implementation has been formally verified to be
 // secure against first-order side-channel analysis (SCA). For more information on the tool,
 // refer to the following papers:
 // [4] Gigerl, "COCO: Co-design and co-verification of masked software implementations on CPUs"
 //     available at https://eprint.iacr.org/2020/1294.pdf
 // [5] Bloem, "Formal verification of masked hardware implementations in the presence of glitches"
 //     available at https://eprint.iacr.org/2017/897.pdf

 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // IMPORTANT NOTE:                                                                               //
 //                            DO NOT USE THIS FOR SYNTHESIS BLINDLY!                             //
 //                                                                                               //
 // This implementation relies on primitive cells like prim_buf/flop_en containing tool-specific  //
 // synthesis attributes to prevent the synthesis tool from optimizing away/re-ordering registers //
 // and to enforce the correct ordering of operations. Without the proper primitives, synthesis   //
 // tools might heavily optimize the design. The result is likely insecure. Use with care.        //
 ///////////////////////////////////////////////////////////////////////////////////////////////////

 `include "prim_assert.sv"

 // Packed struct for pseudo-random data (PRD) input. Stages 1, 3 and 4 require 8 bits each. Stage 2
 // requires just 4 bits.
 typedef struct packed {
   // Packed struct: the first member occupies the most significant bits. Total width is 28 bits
   // (8 + 4 + 8 + 8).
   logic [7:0] prd_1; // Randomness consumed by Stage 1 (8 bits)
   logic [3:0] prd_2; // Randomness consumed by Stage 2 (4 bits)
   logic [7:0] prd_3; // Randomness consumed by Stage 3 (8 bits)
   logic [7:0] prd_4; // Randomness consumed by Stage 4 (8 bits)
 } prd_in_t;

 // Packed struct for pseudo-random data (PRD) output. Stages 2 and 3 produce 8 bits each. Stage 1
 // produces just 4 bits.
 typedef struct packed {
   // Packed struct: the first member occupies the most significant bits. Total width is 20 bits
   // (4 + 8 + 8).
   logic [3:0] prd_1; // Randomness produced by Stage 1 (4 bits)
   logic [7:0] prd_2; // Randomness produced by Stage 2 (8 bits)
   logic [7:0] prd_3; // Randomness produced by Stage 3 (8 bits)
 } prd_out_t;

 // DOM-indep GF(2^N) multiplier, first-order masked.
 // Computes (a_q ^ b_q) = (a_x ^ b_x) * (a_y ^ b_y), i.e. q = x * y using first-order
 // domain-oriented masking. The sharings of x and y are required to be uniformly random and
 // independent from each other.
 // See Fig. 2 in [1].
 module aes_dom_indep_mul_gf2pn #(
   parameter int unsigned NPower   = 4,   // Operand width in bits; only 4 and 2 are supported
   parameter bit          Pipeline = 1'b0 // 1'b1: register the inner-domain terms before
                                          //       integration (see gen_pipeline below)
                                          // 1'b0: only buffer the inner-domain terms
 ) (
   input  logic              clk_i,  // Clock for the internal registers
   input  logic              rst_ni, // Reset for the internal registers (registers reset to '0)
   input  logic              we_i,   // Write enable for the internal registers
   input  logic [NPower-1:0] a_x,    // Share a of x
   input  logic [NPower-1:0] a_y,    // Share a of y
   input  logic [NPower-1:0] b_x,    // Share b of x
   input  logic [NPower-1:0] b_y,    // Share b of y
   input  logic [NPower-1:0] z_0,    // Randomness for resharing
   output logic [NPower-1:0] a_q,    // Share a of q
   output logic [NPower-1:0] b_q     // Share b of q
 );

   import aes_sbox_canright_pkg::*;

   /////////////////
   // Calculation //
   /////////////////
   // Inner-domain terms: products of shares belonging to the same domain (a with a, b with b).
   logic [NPower-1:0] mul_ax_ay_d, mul_bx_by_d;
   if (NPower == 4) begin : gen_inner_mul_gf2p4
     assign mul_ax_ay_d = aes_mul_gf2p4(a_x, a_y);
     assign mul_bx_by_d = aes_mul_gf2p4(b_x, b_y);

   end else begin : gen_inner_mul_gf2p2
     assign mul_ax_ay_d = aes_mul_gf2p2(a_x, a_y);
     assign mul_bx_by_d = aes_mul_gf2p2(b_x, b_y);
   end

   // Cross-domain terms: products mixing one share of domain a with one share of domain b.
   logic [NPower-1:0] mul_ax_by, mul_ay_bx;
   if (NPower == 4) begin : gen_cross_mul_gf2p4
     assign mul_ax_by = aes_mul_gf2p4(a_x, b_y);
     assign mul_ay_bx = aes_mul_gf2p4(a_y, b_x);

   end else begin : gen_cross_mul_gf2p2
     assign mul_ax_by = aes_mul_gf2p2(a_x, b_y);
     assign mul_ay_bx = aes_mul_gf2p2(a_y, b_x);
   end

   ///////////////
   // Resharing //
   ///////////////
   // Resharing of cross-domain terms. Both cross-domain terms are XORed with the same random
   // value z_0 before being registered, so z_0 cancels out when a_q and b_q are recombined.
   logic [NPower-1:0] aq_z0_d, bq_z0_d;
   logic [NPower-1:0] aq_z0_q, bq_z0_q;
   assign aq_z0_d = z_0 ^ mul_ax_by;
   assign bq_z0_d = z_0 ^ mul_ay_bx;

   // Registers holding the reshared cross-domain terms, updated when we_i is set.
   prim_flop_en #(
     .Width      ( 2*NPower ),
     .ResetValue ( '0       )
   ) u_prim_flop_abq_z0 (
     .clk_i  ( clk_i              ),
     .rst_ni ( rst_ni             ),
     .en_i   ( we_i               ),
     .d_i    ( {aq_z0_d, bq_z0_d} ),
     .q_o    ( {aq_z0_q, bq_z0_q} )
   );

   /////////////////////////
   // Optional Pipelining //
   /////////////////////////
   // Inner-domain terms as used for integration: either registered or just buffered.
   logic [NPower-1:0] mul_ax_ay, mul_bx_by;

   if (Pipeline == 1'b1) begin : gen_pipeline
     // Add pipeline registers on inner-domain terms prior to integration. This allows accepting new
     // input data every clock cycle and prevents SCA leakage occurring due to the integration of
     // reshared cross-domain terms with inner-domain terms derived from different input data.

     logic [NPower-1:0] mul_ax_ay_q, mul_bx_by_q;
     prim_flop_en #(
       .Width      ( 2*NPower ),
       .ResetValue ( '0       )
     ) u_prim_flop_mul_abx_aby (
       .clk_i  ( clk_i                      ),
       .rst_ni ( rst_ni                     ),
       .en_i   ( we_i                       ),
       .d_i    ( {mul_ax_ay_d, mul_bx_by_d} ),
       .q_o    ( {mul_ax_ay_q, mul_bx_by_q} )
     );

     assign mul_ax_ay = mul_ax_ay_q;
     assign mul_bx_by = mul_bx_by_q;

   end else begin : gen_no_pipeline
     // Do not add the optional pipeline registers on the inner-domain terms. This saves some area
     // in case the multiplier does not need to accept new data in every cycle. However,
     // this can cause SCA leakage as during the clock cycle in which new data arrives, the new
     // inner-domain terms are integrated with the previous, reshared cross-domain terms.

     // Avoid aggressive synthesis optimizations.
     logic [NPower-1:0] mul_ax_ay_buf, mul_bx_by_buf;
     prim_buf #(
       .Width  ( 2*NPower )
     ) u_prim_buf_mul_abx_aby (
       .in_i  ( {mul_ax_ay_d,   mul_bx_by_d}   ),
       .out_o ( {mul_ax_ay_buf, mul_bx_by_buf} )
     );

     assign mul_ax_ay = mul_ax_ay_buf;
     assign mul_bx_by = mul_bx_by_buf;
   end

   /////////////////
   // Integration //
   /////////////////
   // Each output share combines the inner-domain term of its own domain with one registered,
   // reshared cross-domain term.
   assign a_q = mul_ax_ay ^ aq_z0_q;
   assign b_q = mul_bx_by ^ bq_z0_q;

   // Only GF(2^4) and GF(2^2) are supported.
   `ASSERT_INIT(AesDomIndepMulPower, NPower == 4 || NPower == 2)

 endmodule

 // DOM-dep GF(2^N) multiplier, first-order masked.
 // Computes (a_q ^ b_q) = (a_x ^ b_x) * (a_y ^ b_y), i.e. q = x * y using first-order
 // domain-oriented masking. The sharings of x and y are NOT required to be independent from each
 // other. This is the un-optimized version consuming 3 times N bits of randomness for blinding and
 // resharing. It is not used in the design but we keep it for reference.
 // See Fig. 4 and Formulas 8 - 11 in [1].
 module aes_dom_dep_mul_gf2pn_unopt #(
   parameter int unsigned NPower   = 4,   // Operand width in bits; only 4 and 2 are supported
   parameter bit          Pipeline = 1'b0 // 1'b1: register input x (see gen_pipeline below) and
                                          //       pipeline the internal DOM-indep multiplier
                                          // 1'b0: use input x directly
 ) (
   input  logic              clk_i,  // Clock for the internal registers
   input  logic              rst_ni, // Reset for the internal registers (registers reset to '0)
   input  logic              we_i,   // Write enable for the internal registers
   input  logic [NPower-1:0] a_x,    // Share a of x
   input  logic [NPower-1:0] a_y,    // Share a of y
   input  logic [NPower-1:0] b_x,    // Share b of x
   input  logic [NPower-1:0] b_y,    // Share b of y
   input  logic [NPower-1:0] a_z,    // Randomness for blinding
   input  logic [NPower-1:0] b_z,    // Randomness for blinding
   input  logic [NPower-1:0] z_0,    // Randomness for resharing
   output logic [NPower-1:0] a_q,    // Share a of q
   output logic [NPower-1:0] b_q     // Share b of q
 );

   import aes_sbox_canright_pkg::*;

   //////////////
   // Blinding //
   //////////////
   // Blinding of y by z. Each share of y is XORed with the matching share of z before being
   // registered.
   logic [NPower-1:0] a_yz_d, b_yz_d;
   logic [NPower-1:0] a_yz_q, b_yz_q;
   assign a_yz_d = a_y ^ a_z;
   assign b_yz_d = b_y ^ b_z;

   // Registers holding the blinded shares of y, updated when we_i is set.
   prim_flop_en #(
     .Width      ( 2*NPower ),
     .ResetValue ( '0       )
   ) u_prim_flop_ab_yz (
     .clk_i  ( clk_i            ),
     .rst_ni ( rst_ni           ),
     .en_i   ( we_i             ),
     .d_i    ( {a_yz_d, b_yz_d} ),
     .q_o    ( {a_yz_q, b_yz_q} )
   );

   ////////////////
   // Correction //
   ////////////////
   // Shared product x * z computed with a DOM-indep multiplier. It is XORed onto the outputs
   // below to remove the contribution of the blinding value z.
   logic [NPower-1:0] a_mul_x_z, b_mul_x_z;
   aes_dom_indep_mul_gf2pn #(
     .NPower   ( NPower   ),
     .Pipeline ( Pipeline )
   ) u_aes_dom_indep_mul_gf2pn (
     .clk_i  ( clk_i     ),
     .rst_ni ( rst_ni    ),
     .we_i   ( we_i      ),
     .a_x    ( a_x       ), // Share a of x
     .a_y    ( a_z       ), // Share a of z
     .b_x    ( b_x       ), // Share b of x
     .b_y    ( b_z       ), // Share b of z
     .z_0    ( z_0       ), // Randomness for resharing
     .a_q    ( a_mul_x_z ), // Share a of x * z
     .b_q    ( b_mul_x_z )  // Share b of x * z
   );

   /////////////////////////
   // Optional Pipelining //
   /////////////////////////
   // Shares of x as used for the multiplication with b: either registered or taken directly.
   logic [NPower-1:0] a_x_calc, b_x_calc;

   if (Pipeline == 1'b1) begin : gen_pipeline
     // Add pipeline registers for input x. This allows accepting new input data every clock cycle
     // and prevents SCA leakage occurring due to the multiplication of input x with b belonging to
     // different clock cycles.

     logic [NPower-1:0] a_x_q, b_x_q;
     prim_flop_en #(
       .Width      ( 2*NPower ),
       .ResetValue ( '0       )
     ) u_prim_flop_ab_x (
       .clk_i  ( clk_i          ),
       .rst_ni ( rst_ni         ),
       .en_i   ( we_i           ),
       .d_i    ( {a_x,   b_x}   ),
       .q_o    ( {a_x_q, b_x_q} )
     );

     assign a_x_calc = a_x_q;
     assign b_x_calc = b_x_q;

   end else begin : gen_no_pipeline
     // Do not add the optional pipeline registers for input x. This saves some area in
     // case the multiplier does not need to accept new data in every cycle. However, this can cause
     // SCA leakage as during the clock cycle in which new data arrives, the new x input is
     // multiplied with the previous b.

     assign a_x_calc = a_x;
     assign b_x_calc = b_x;
   end

   /////////////////
   // Calculation //
   /////////////////
   // Combine shares of blinded y to obtain b. Only the registered, blinded shares are combined
   // here, never the raw shares of y.
   logic [NPower-1:0] b;
   assign b = a_yz_q ^ b_yz_q;

   // Multiply each share of x with the combined, blinded value b.
   logic [NPower-1:0] a_mul_ax_b, b_mul_bx_b;
   if (NPower == 4) begin : gen_mul_gf2p4
     assign a_mul_ax_b = aes_mul_gf2p4(a_x_calc, b);
     assign b_mul_bx_b = aes_mul_gf2p4(b_x_calc, b);

   end else begin : gen_mul_gf2p2
     assign a_mul_ax_b = aes_mul_gf2p2(a_x_calc, b);
     assign b_mul_bx_b = aes_mul_gf2p2(b_x_calc, b);
   end

   /////////////////
   // Integration //
   /////////////////
   // Per share: (share of x * z) ^ (share of x * b).
   assign a_q = a_mul_x_z ^ a_mul_ax_b;
   assign b_q = b_mul_x_z ^ b_mul_bx_b;

   // Only GF(2^4) and GF(2^2) are supported.
   `ASSERT_INIT(AesDomDepMulUnoptPower, NPower == 4 || NPower == 2)

 endmodule

 // DOM-dep GF(2^N) multiplier, first-order masked.
 // Computes (a_q ^ b_q) = (a_x ^ b_x) * (a_y ^ b_y), i.e. q = x * y using first-order
 // domain-oriented masking. The sharings of x and y are NOT required to be independent from each
 // other. This is the optimized version consuming 2 instead of 3 times N bits of randomness for
 // blinding and resharing.
 // See Formula 12 in [1].
 module aes_dom_dep_mul_gf2pn #(
   parameter int unsigned NPower      = 4,    // Operand width in bits; only 4 and 2 are supported
   parameter bit          Pipeline    = 1'b0, // 1'b1: use the externally registered inputs
                                              //       a/b_x_q, a/b_y_q (unless PreDomIndep=1)
                                              // 1'b0: use the inputs a/b_x, a/b_y directly
   parameter bit          PreDomIndep = 1'b0 // 1'b0: Not followed by an un-pipelined DOM-indep
                                             //       multiplier, this enables additional area
                                             //       optimizations
                                             // 1'b1: Directly followed by an un-pipelined
                                             //       DOM-indep multiplier, this is the version
                                             //       discussed in [1].
 ) (
   input  logic                clk_i,  // Clock for the internal registers
   input  logic                rst_ni, // Reset for the internal registers (registers reset to '0)
   input  logic                we_i,   // Write enable for the internal registers
   input  logic   [NPower-1:0] a_x,    // Share a of x
   input  logic   [NPower-1:0] a_y,    // Share a of y
   input  logic   [NPower-1:0] b_x,    // Share b of x
   input  logic   [NPower-1:0] b_y,    // Share b of y
   input  logic   [NPower-1:0] a_x_q,  // Share a of x, pipelined (for Pipeline=1 or PreDomIndep=1)
   input  logic   [NPower-1:0] a_y_q,  // Share a of y, pipelined (for Pipeline=1)
   input  logic   [NPower-1:0] b_x_q,  // Share b of x, pipelined (for Pipeline=1 or PreDomIndep=1)
   input  logic   [NPower-1:0] b_y_q,  // Share b of y, pipelined (for Pipeline=1)
   input  logic   [NPower-1:0] z_0,    // Randomness for blinding
   input  logic   [NPower-1:0] z_1,    // Randomness for resharing
   output logic   [NPower-1:0] a_q,    // Share a of q
   output logic   [NPower-1:0] b_q,    // Share b of q
   output logic [2*NPower-1:0] prd_o   // Randomness for use in another S-Box instance
 );

   import aes_sbox_canright_pkg::*;

   //////////////
   // Blinding //
   //////////////
   // Blinding of y by z_0. Both shares of y are blinded with the same z_0.
   logic [NPower-1:0] a_yz0_d, b_yz0_d;
   logic [NPower-1:0] a_yz0_q, b_yz0_q;
   assign a_yz0_d = a_y ^ z_0;
   assign b_yz0_d = b_y ^ z_0;

   // Registers holding the blinded shares of y, updated when we_i is set.
   prim_flop_en #(
     .Width      ( 2*NPower ),
     .ResetValue ( '0       )
   ) u_prim_flop_ab_yz0 (
     .clk_i  ( clk_i              ),
     .rst_ni ( rst_ni             ),
     .en_i   ( we_i               ),
     .d_i    ( {a_yz0_d, b_yz0_d} ),
     .q_o    ( {a_yz0_q, b_yz0_q} )
   );

   ////////////////
   // Correction //
   ////////////////
   // Basically, this is a DOM-indep multiplier with:
   // - a_x = a_x, b_x = b_x, and
   // - a_y = z_0, b_y = 0 (constant),
   // which allows for further optimizations.

   // Calculation
   logic [NPower-1:0] mul_ax_z0, mul_bx_z0;
   if (NPower == 4) begin : gen_corr_mul_gf2p4
     assign mul_ax_z0 = aes_mul_gf2p4(a_x, z_0);
     assign mul_bx_z0 = aes_mul_gf2p4(b_x, z_0);

   end else begin : gen_corr_mul_gf2p2
     assign mul_ax_z0 = aes_mul_gf2p2(a_x, z_0);
     assign mul_bx_z0 = aes_mul_gf2p2(b_x, z_0);
   end

   // Avoid aggressive synthesis optimizations.
   logic [NPower-1:0] mul_ax_z0_buf, mul_bx_z0_buf;
   prim_buf #(
     .Width ( 2*NPower )
   ) u_prim_buf_mul_abx_z0 (
     .in_i  ( {mul_ax_z0,     mul_bx_z0}     ),
     .out_o ( {mul_ax_z0_buf, mul_bx_z0_buf} )
   );

   // Resharing. Both correction terms are XORed with the same z_1, so z_1 cancels out when a_q
   // and b_q are recombined.
   logic [NPower-1:0] axz0_z1_d, bxz0_z1_d;
   logic [NPower-1:0] axz0_z1_q, bxz0_z1_q;
   assign axz0_z1_d = mul_ax_z0_buf ^ z_1;
   assign bxz0_z1_d = mul_bx_z0_buf ^ z_1;

   // Registers holding the reshared correction terms, updated when we_i is set.
   prim_flop_en #(
     .Width      ( 2*NPower ),
     .ResetValue ( '0       )
   ) u_prim_flop_abxz0_z1 (
     .clk_i  ( clk_i                  ),
     .rst_ni ( rst_ni                 ),
     .en_i   ( we_i                   ),
     .d_i    ( {axz0_z1_d, bxz0_z1_d} ),
     .q_o    ( {axz0_z1_q, bxz0_z1_q} )
   );

   // Use intermediate results for generating PRD for another S-Box instance.
   // Use one share only. Directly use output of flops updating with we_i.
   // These intermediate results are obtained by remasking b_y and mul_bx_z0 with z_0 and z_1,
   // respectively. Since z_0/1 are uniformly distributed and independent of b_y and mul_bx_z0,
   // the intermediate results are also uniformly distributed and independent of b_y and mul_bx_z0.
   // For details, see Lemma 1 in [2].
   assign prd_o = {b_yz0_q, bxz0_z1_q};

   /////////////////////////
   // Optional Pipelining //
   /////////////////////////
   // Shares of x and y as used for the multiplications below.
   logic [NPower-1:0] a_x_calc, b_x_calc, a_y_calc, b_y_calc;

   // Note that the pipelined inputs are selected only if Pipeline is set and PreDomIndep is not.
   // With PreDomIndep = 1, the *_calc signals feed the registered inner-domain multipliers and
   // a_x_q/b_x_q are used directly for the cross-domain terms further below.
   if (Pipeline == 1'b1 && PreDomIndep != 1'b1) begin : gen_pipeline_use
     // Use pipelined inputs x and y. This allows accepting new input data every clock cycle and
     // prevents SCA leakage occurring due to the multiplication of inputs x and y with d_b
     // belonging to different clock cycles.
     //
     // The PreDomIndep variant uses the pipelined inputs directly.

     assign a_x_calc = a_x_q;
     assign b_x_calc = b_x_q;
     assign a_y_calc = a_y_q;
     assign b_y_calc = b_y_q;

   end else begin : gen_no_pipeline_use
     // Do not use pipelined inputs x and y. This saves some area in case the multiplier
     // does not need to accept new data in every cycle. However, this can cause SCA leakage as
     // during the clock cycle in which new data arrives, the new x and y inputs are multiplied
     // with the previous d_b.

     assign a_x_calc = a_x;
     assign b_x_calc = b_x;
     assign a_y_calc = a_y;
     assign b_y_calc = b_y;

     // Tie off unused signals. a_x_q/b_x_q are only unused if PreDomIndep is not set, whereas
     // a_y_q/b_y_q are always unused in this branch.
     if (PreDomIndep != 1'b1) begin : gen_ab_x_q
       logic [NPower-1:0] unused_a_x_q, unused_b_x_q;
       assign unused_a_x_q = a_x_q;
       assign unused_b_x_q = b_x_q;
     end
     logic [NPower-1:0] unused_a_y_q, unused_b_y_q;
     assign unused_a_y_q = a_y_q;
     assign unused_b_y_q = b_y_q;
   end

   ///////////////////////////////
   // Calculation & Integration //
   ///////////////////////////////
   // Compute b. Note that unlike for the unoptimized implementation, we don't combine the blinded
   // shares of y to obtain a single b value. Instead, every domain d gets its own version of b:
   //
   //   d_b = d_y ^ _D_y_z0
   //
   // where _D_y_z0 corresponds to the sum of all domains of y except for domain d, each
   // individually blinded by z0 (needs to happen before the register bank). This optimization
   // is only suitable for first-order masking.
   // See Formula 12 in [1].

   if (PreDomIndep == 1'b1) begin : gen_pre_dom_indep
     // This DOM-dep multiplier is directly followed by an un-pipelined DOM-indep multiplier. To
     // prevent SCA leakage in the un-pipelined DOM-indep multiplier, the d_y and _D_y_z0 parts of
     // d_b need to be individually multiplied with input x and then the results need to be
     // integrated (summed up) on a per-domain basis.

     // d_y part: Inner-domain terms of x * y
     logic [NPower-1:0] mul_ax_ay_d, mul_bx_by_d;
     logic [NPower-1:0] mul_ax_ay_q, mul_bx_by_q;
     if (NPower == 4) begin : gen_inner_mul_gf2p4
       assign mul_ax_ay_d = aes_mul_gf2p4(a_x_calc, a_y_calc);
       assign mul_bx_by_d = aes_mul_gf2p4(b_x_calc, b_y_calc);

     end else begin : gen_inner_mul_gf2p2
       assign mul_ax_ay_d = aes_mul_gf2p2(a_x_calc, a_y_calc);
       assign mul_bx_by_d = aes_mul_gf2p2(b_x_calc, b_y_calc);
     end

     // Registers holding the inner-domain terms, updated when we_i is set.
     prim_flop_en #(
       .Width      ( 2*NPower ),
       .ResetValue ( '0       )
     ) u_prim_flop_mul_abx_aby (
       .clk_i  ( clk_i                      ),
       .rst_ni ( rst_ni                     ),
       .en_i   ( we_i                       ),
       .d_i    ( {mul_ax_ay_d, mul_bx_by_d} ),
       .q_o    ( {mul_ax_ay_q, mul_bx_by_q} )
     );

     // _D_y_z0 part: Cross-domain terms: d_x * _D_y_z0
     // Need to use registered version of input x.
     logic [NPower-1:0] mul_ax_byz0, mul_bx_ayz0;
     if (NPower == 4) begin : gen_cross_mul_gf2p4
       assign mul_ax_byz0 = aes_mul_gf2p4(a_x_q, b_yz0_q);
       assign mul_bx_ayz0 = aes_mul_gf2p4(b_x_q, a_yz0_q);

     end else begin : gen_cross_mul_gf2p2
       assign mul_ax_byz0 = aes_mul_gf2p2(a_x_q, b_yz0_q);
       assign mul_bx_ayz0 = aes_mul_gf2p2(b_x_q, a_yz0_q);
     end

     // Avoid aggressive synthesis optimizations.
     logic [NPower-1:0] mul_ax_byz0_buf, mul_bx_ayz0_buf;
     prim_buf #(
       .Width ( 2*NPower )
     ) u_prim_buf_mul_abx_bayz0 (
       .in_i  ( {mul_ax_byz0,     mul_bx_ayz0}     ),
       .out_o ( {mul_ax_byz0_buf, mul_bx_ayz0_buf} )
     );

     // Integration: per domain, sum up the reshared correction term, the registered inner-domain
     // term and the buffered cross-domain term.
     assign a_q = axz0_z1_q ^ mul_ax_ay_q ^ mul_ax_byz0_buf;
     assign b_q = bxz0_z1_q ^ mul_bx_by_q ^ mul_bx_ayz0_buf;

   end else begin : gen_not_pre_dom_indep
     // This DOM-dep multiplier is not directly followed by an un-pipelined DOM-indep multiplier. As
     // a result, the d_y and _D_y_z0 parts of d_b can be summed up prior to the multiplication
     // with input x which allows saving 2 GF multipliers.

     // Sum up d_y and _D_y_z0. Each domain combines its own share of y with the registered,
     // blinded share of the other domain.
     logic [NPower-1:0] a_b, b_b;
     assign a_b = a_y_calc ^ b_yz0_q;
     assign b_b = b_y_calc ^ a_yz0_q;

     // Avoid aggressive synthesis optimizations.
     logic [NPower-1:0] a_b_buf, b_b_buf;
     prim_buf #(
       .Width ( 2*NPower )
     ) u_prim_buf_ab_b (
       .in_i  ( {a_b,     b_b}     ),
       .out_o ( {a_b_buf, b_b_buf} )
     );

     // GF multiplications
     logic [NPower-1:0] a_mul_ax_b, b_mul_bx_b;
     if (NPower == 4) begin : gen_mul_gf2p4
       assign a_mul_ax_b = aes_mul_gf2p4(a_x_calc, a_b_buf);
       assign b_mul_bx_b = aes_mul_gf2p4(b_x_calc, b_b_buf);
     end else begin : gen_mul_gf2p2
       assign a_mul_ax_b = aes_mul_gf2p2(a_x_calc, a_b_buf);
       assign b_mul_bx_b = aes_mul_gf2p2(b_x_calc, b_b_buf);
     end

     // Avoid aggressive synthesis optimizations.
     logic [NPower-1:0] a_mul_ax_b_buf, b_mul_bx_b_buf;
     prim_buf #(
       .Width ( 2*NPower )
     ) u_prim_buf_ab_mul_abx_b (
       .in_i  ( {a_mul_ax_b,     b_mul_bx_b}     ),
       .out_o ( {a_mul_ax_b_buf, b_mul_bx_b_buf} )
     );

     // Integration: per domain, sum up the reshared correction term and the buffered product.
     assign a_q = axz0_z1_q ^ a_mul_ax_b_buf;
     assign b_q = bxz0_z1_q ^ b_mul_bx_b_buf;
   end

   // Only GF(2^4) and GF(2^2) are supported.
   `ASSERT_INIT(AesDomDepMulPower, NPower == 4 || NPower == 2)

 endmodule

 // Inverse in GF(2^4) using first-order domain-oriented masking and normal basis [z^4, z].
 // See Fig. 6 in [1] (grey block, Stages 2 and 3) and Formulas 6, 13, 14, 15, 16, 17 in [2].
 module aes_dom_inverse_gf2p4 #(
   parameter bit PipelineMul = 1'b1 // Forwarded as Pipeline parameter to the DOM-dep multipliers.
                                    // 1'b1 also adds the Stage 3 input pipeline registers.
 ) (
   input  logic        clk_i,       // Clock for the internal registers
   input  logic        rst_ni,      // Reset for the internal registers (registers reset to '0)
   input  logic  [1:0] we_i,        // Write enables: [0] for Stage 2, [1] for Stage 3 registers
   input  logic  [3:0] a_gamma,     // Share a of gamma
   input  logic  [3:0] b_gamma,     // Share b of gamma
   input  logic  [3:0] prd_2_i,     // Randomness for Stage 2: [1:0] blinding, [3:2] resharing
   input  logic  [7:0] prd_3_i,     // Randomness for Stage 3: [7:4] for omega * gamma1,
                                    // [3:0] for omega * gamma0
   output logic  [3:0] a_gamma_inv, // Share a of the result
   output logic  [3:0] b_gamma_inv, // Share b of the result
   output logic  [7:0] prd_2_o,     // Randomness derived in Stage 2 for another S-Box instance
   output logic  [7:0] prd_3_o      // Randomness derived in Stage 3 for another S-Box instance
 );

   import aes_sbox_canright_pkg::*;

   /////////////
   // Stage 2 //
   /////////////
   // Formula 13 in [2].

   // Split both shares of gamma into upper (gamma1) and lower (gamma0) halves.
   logic [1:0] a_gamma1, a_gamma0, b_gamma1, b_gamma0, a_gamma1_gamma0, b_gamma1_gamma0;
   assign a_gamma1 = a_gamma[3:2];
   assign a_gamma0 = a_gamma[1:0];
   assign b_gamma1 = b_gamma[3:2];
   assign b_gamma0 = b_gamma[1:0];

   // Square-scale of (gamma1 ^ gamma0), computed per share and registered with we_i[0].
   logic [1:0] a_gamma_ss_d, b_gamma_ss_d;
   logic [1:0] a_gamma_ss_q, b_gamma_ss_q;
   assign a_gamma_ss_d = aes_scale_omega2_gf2p2(aes_square_gf2p2(a_gamma1 ^ a_gamma0));
   assign b_gamma_ss_d = aes_scale_omega2_gf2p2(aes_square_gf2p2(b_gamma1 ^ b_gamma0));
   prim_flop_en #(
     .Width      ( 4  ),
     .ResetValue ( '0 )
   ) u_prim_flop_ab_gamma_ss (
     .clk_i  ( clk_i                        ),
     .rst_ni ( rst_ni                       ),
     .en_i   ( we_i[0]                      ),
     .d_i    ( {a_gamma_ss_d, b_gamma_ss_d} ),
     .q_o    ( {a_gamma_ss_q, b_gamma_ss_q} )
   );

   // Registered halves of gamma. These are instantiated irrespective of PipelineMul as they
   // serve both as pipelined inputs for the Stage 2 multiplier and as inputs for the Stage 3
   // multipliers.
   logic [1:0] a_gamma1_q, a_gamma0_q, b_gamma1_q, b_gamma0_q;
   prim_flop_en #(
     .Width      ( 8  ),
     .ResetValue ( '0 )
   ) u_prim_flop_ab_gamma10 (
     .clk_i  ( clk_i                                            ),
     .rst_ni ( rst_ni                                           ),
     .en_i   ( we_i[0]                                          ),
     .d_i    ( {a_gamma1,   a_gamma0,   b_gamma1,   b_gamma0}   ),
     .q_o    ( {a_gamma1_q, a_gamma0_q, b_gamma1_q, b_gamma0_q} )
   );

   // Masked product gamma1 * gamma0.
   logic [3:0] b_gamma10_prd2;
   aes_dom_dep_mul_gf2pn #(
     .NPower      ( 2           ),
     .Pipeline    ( PipelineMul ),
     .PreDomIndep ( 1'b0        )
   ) u_aes_dom_mul_gamma1_gamma0 (
     .clk_i  ( clk_i           ),
     .rst_ni ( rst_ni          ),
     .we_i   ( we_i[0]         ),
     .a_x    ( a_gamma1        ), // Share a of x
     .a_y    ( a_gamma0        ), // Share a of y
     .b_x    ( b_gamma1        ), // Share b of x
     .b_y    ( b_gamma0        ), // Share b of y
     .a_x_q  ( a_gamma1_q      ), // Share a of x, pipelined (for Pipeline=1 or PreDomIndep=1)
     .a_y_q  ( a_gamma0_q      ), // Share a of y, pipelined (for Pipeline=1)
     .b_x_q  ( b_gamma1_q      ), // Share b of x, pipelined (for Pipeline=1 or PreDomIndep=1)
     .b_y_q  ( b_gamma0_q      ), // Share b of y, pipelined (for Pipeline=1)
     .z_0    ( prd_2_i[1:0]    ), // Randomness for blinding
     .z_1    ( prd_2_i[3:2]    ), // Randomness for resharing
     .a_q    ( a_gamma1_gamma0 ), // Share a of q
     .b_q    ( b_gamma1_gamma0 ), // Share b of q
     .prd_o  ( b_gamma10_prd2  )  // Randomness for use in another S-Box instance
   );

   // Use intermediate results for generating PRD for Stage 3 of another S-Box instance.
   // Use one share only. Directly use output of flops updating with we_i[0].
   // b_gamma10_prd2 is based on b_gamma1_q, b_gamma0_q but XORed with prd_2_i, thus uniformly
   // distributed and independent of b_gamma1/0_q (See Lemma 1 in [2]).
   //
   // In Stage 3 of another S-Box instance, the MSBs and LSBs of the term below are used:
   // 1. as randomness in the DOM-dep multipliers u_aes_dom_mul_omega_gamma1/0, and
   // 2. to generate randomness for the DOM-indep multipliers u_aes_dom_mul_theta_y1/0 in Stage 4 of
   //    yet another S-Box instance, respectively.
   // Without interleaving b_gamma1/0_q as well as the upper and lower halves of b_gamma10_prd2 here,
   // a glitch on the write-enable signal on the input pipeline register of these DOM-indep
   // multipliers may result in undesirable SCA leakage.
   assign prd_2_o = {b_gamma1_q, b_gamma10_prd2[3:2], b_gamma0_q, b_gamma10_prd2[1:0]};

   /////////////
   // Stage 3 //
   /////////////

   // Formulas 14 and 15 in [2].
   logic [1:0] a_omega, b_omega;
   assign a_omega = aes_square_gf2p2(a_gamma1_gamma0 ^ a_gamma_ss_q);
   assign b_omega = aes_square_gf2p2(b_gamma1_gamma0 ^ b_gamma_ss_q);

   // Avoid aggressive synthesis optimizations.
   logic [1:0] a_omega_buf, b_omega_buf;
   prim_buf #(
     .Width ( 4 )
   ) u_prim_buf_ab_omega (
     .in_i  ( {a_omega,     b_omega}     ),
     .out_o ( {a_omega_buf, b_omega_buf} )
   );

   // Pipeline registers
   logic [1:0] a_gamma1_qq, a_gamma0_qq, b_gamma1_qq, b_gamma0_qq, a_omega_buf_q, b_omega_buf_q;
   if (PipelineMul == 1'b1) begin: gen_prim_flop_omega_gamma10
     // We instantiate the input pipeline registers for the DOM-dep multiplier outside of the
     // multiplier to enable sharing of pipeline registers where applicable.

     prim_flop_en #(
       .Width      ( 8  ),
       .ResetValue ( '0 )
     ) u_prim_flop_ab_gamma10_q (
       .clk_i  ( clk_i                                                ),
       .rst_ni ( rst_ni                                               ),
       .en_i   ( we_i[1]                                              ),
       .d_i    ( {a_gamma1_q,  a_gamma0_q,  b_gamma1_q,  b_gamma0_q}  ),
       .q_o    ( {a_gamma1_qq, a_gamma0_qq, b_gamma1_qq, b_gamma0_qq} )
     );

     // These inputs are used by both DOM-dep multipliers below.
     prim_flop_en #(
       .Width      ( 4  ),
       .ResetValue ( '0 )
     ) u_prim_flop_ab_omega_buf (
       .clk_i  ( clk_i                          ),
       .rst_ni ( rst_ni                         ),
       .en_i   ( we_i[1]                        ),
       .d_i    ( {a_omega_buf,   b_omega_buf}   ),
       .q_o    ( {a_omega_buf_q, b_omega_buf_q} )
     );

   end else begin : gen_no_prim_flop_ab_y10
     // When using un-pipelined multipliers, there is no need to insert additional registers.
     // We drive the corresponding inputs to 0 to make sure the functionality isn't correct in case
     // the pipelined inputs are erroneously used.

     assign a_gamma1_qq = '0;
     assign a_gamma0_qq = '0;
     assign b_gamma1_qq = '0;
     assign b_gamma0_qq = '0;
     assign a_omega_buf_q = '0;
     assign b_omega_buf_q = '0;
   end

   // Formulas 16 and 17 in [2].
   // The product with gamma1 drives the lower half of the result, the product with gamma0 drives
   // the upper half.
   logic [3:0] b_gamma1_omega_prd3;
   aes_dom_dep_mul_gf2pn #(
     .NPower      ( 2           ),
     .Pipeline    ( PipelineMul ),
     .PreDomIndep ( 1'b0        )
   ) u_aes_dom_mul_omega_gamma1 (
     .clk_i  ( clk_i               ),
     .rst_ni ( rst_ni              ),
     .we_i   ( we_i[1]             ),
     .a_x    ( a_gamma1_q          ), // Share a of x
     .a_y    ( a_omega_buf         ), // Share a of y
     .b_x    ( b_gamma1_q          ), // Share b of x
     .b_y    ( b_omega_buf         ), // Share b of y
     .a_x_q  ( a_gamma1_qq         ), // Share a of x, pipelined (for Pipeline=1 or PreDomIndep=1)
     .a_y_q  ( a_omega_buf_q       ), // Share a of y, pipelined (for Pipeline=1)
     .b_x_q  ( b_gamma1_qq         ), // Share b of x, pipelined (for Pipeline=1 or PreDomIndep=1)
     .b_y_q  ( b_omega_buf_q       ), // Share b of y, pipelined (for Pipeline=1)
     .z_0    ( prd_3_i[5:4]        ), // Randomness for blinding
     .z_1    ( prd_3_i[7:6]        ), // Randomness for resharing
     .a_q    ( a_gamma_inv[1:0]    ), // Share a of q
     .b_q    ( b_gamma_inv[1:0]    ), // Share b of q
     .prd_o  ( b_gamma1_omega_prd3 )  // Randomness for use in another S-Box instance
   );

   logic [3:0] b_gamma0_omega_prd3;
   aes_dom_dep_mul_gf2pn #(
     .NPower      ( 2           ),
     .Pipeline    ( PipelineMul ),
     .PreDomIndep ( 1'b0        )
   ) u_aes_dom_mul_omega_gamma0 (
     .clk_i  ( clk_i               ),
     .rst_ni ( rst_ni              ),
     .we_i   ( we_i[1]             ),
     .a_x    ( a_omega_buf         ), // Share a of x
     .a_y    ( a_gamma0_q          ), // Share a of y
     .b_x    ( b_omega_buf         ), // Share b of x
     .b_y    ( b_gamma0_q          ), // Share b of y
     .a_x_q  ( a_omega_buf_q       ), // Share a of x, pipelined (for Pipeline=1 or PreDomIndep=1)
     .a_y_q  ( a_gamma0_qq         ), // Share a of y, pipelined (for Pipeline=1)
     .b_x_q  ( b_omega_buf_q       ), // Share b of x, pipelined (for Pipeline=1 or PreDomIndep=1)
     .b_y_q  ( b_gamma0_qq         ), // Share b of y, pipelined (for Pipeline=1)
     .z_0    ( prd_3_i[1:0]        ), // Randomness for blinding
     .z_1    ( prd_3_i[3:2]        ), // Randomness for resharing
     .a_q    ( a_gamma_inv[3:2]    ), // Share a of q
     .b_q    ( b_gamma_inv[3:2]    ), // Share b of q
     .prd_o  ( b_gamma0_omega_prd3 )  // Randomness for use in another S-Box instance
   );

   // Use intermediate results for generating PRD for Stage 4 of another S-Box instance.
   // Use one share only. Directly use output of flops updating with we_i[1].
   // b_gamma1/0_omega_prd3 are both based on b_omega but XORed with different parts of prd_3_i,
   // thus uniformly distributed and independent of b_omega (see Lemma 1 in [2]).
   assign prd_3_o = {b_gamma1_omega_prd3, b_gamma0_omega_prd3};

 endmodule

 // Inverse in GF(2^8) using first-order domain-oriented masking and normal basis [y^16, y].
 // See Fig. 6 in [1] and Formulas 3, 12, 18 and 19 in [2].
 //
 // Operand and result are each represented by two shares (a_* and b_*). The computation is spread
 // over four stages, each gated by one bit of we_i:
 // - we_i[0]:   Stage 1, registers the square-scaled shares and drives the y1*y0 multiplier.
 // - we_i[2:1]: Stages 2 and 3, forwarded to the GF(2^4) inverter.
 // - we_i[2]:   Additionally registers the input nibbles for re-use in Stage 4.
 // - we_i[3]:   Stage 4, drives the two theta*y multipliers producing the output shares.
 //
 // Parameters:
 // - PipelineMul: Forwarded to all multipliers (as Pipeline) and to the GF(2^4) inverter. If set,
 //                registered copies of the Stage 1 multiplier inputs are generated in this module.
 //
 // The fields of prd_o are named after the stage producing them. According to the comments at the
 // points of assignment, the data is intended for the subsequent stage of another S-Box instance.
 module aes_dom_inverse_gf2p8 #(
   parameter bit PipelineMul = 1'b1
 ) (
   input  logic        clk_i,
   input  logic        rst_ni,
   input  logic  [3:0] we_i,    // per-stage write enables, bit N-1 gates Stage N
   input  logic  [7:0] a_y,     // input data masked by b_y
   input  logic  [7:0] b_y,     // input mask
   input  prd_in_t     prd_i,   // pseudo-random data, e.g. for intermediate masks
   output logic  [7:0] a_y_inv, // output data masked by b_y_inv
   output logic  [7:0] b_y_inv, // output mask
   output prd_out_t    prd_o    // pseudo-random data, e.g. for use in another S-Box instance
 );

   import aes_sbox_canright_pkg::*;

   /////////////
   // Stage 1 //
   /////////////
   // Formula 12 in [2].

   // Split both input shares into upper (y1) and lower (y0) nibbles.
   logic [3:0] a_y1, a_y0, b_y1, b_y0, a_y1_y0, b_y1_y0;
   assign a_y1 = a_y[7:4];
   assign a_y0 = a_y[3:0];
   assign b_y1 = b_y[7:4];
   assign b_y0 = b_y[3:0];

   // Square and scale the sum of the nibbles. This is done separately per share and the results
   // are registered with we_i[0], i.e., together with the Stage 1 multiplier.
   logic [3:0] a_y_ss_d, b_y_ss_d;
   logic [3:0] a_y_ss_q, b_y_ss_q;
   assign a_y_ss_d = aes_square_scale_gf2p4_gf2p2(a_y1 ^ a_y0);
   assign b_y_ss_d = aes_square_scale_gf2p4_gf2p2(b_y1 ^ b_y0);
   prim_flop_en #(
     .Width      ( 8  ),
     .ResetValue ( '0 )
   ) u_prim_flop_ab_y_ss (
     .clk_i  ( clk_i                ),
     .rst_ni ( rst_ni               ),
     .en_i   ( we_i[0]              ),
     .d_i    ( {a_y_ss_d, b_y_ss_d} ),
     .q_o    ( {a_y_ss_q, b_y_ss_q} )
   );

   // Registered copies of the input nibbles for the pipelined inputs of the Stage 1 multiplier.
   logic [3:0] a_y1_q, a_y0_q, b_y1_q, b_y0_q;
   if (PipelineMul == 1'b1) begin: gen_prim_flop_ab_y10
     // We instantiate the input pipeline registers for the DOM-dep multiplier outside of the
     // multiplier to enable sharing of pipeline registers where applicable.

     prim_flop_en #(
       .Width      ( 16  ),
       .ResetValue ( '0  )
     ) u_prim_flop_ab_y10 (
       .clk_i  ( clk_i                            ),
       .rst_ni ( rst_ni                           ),
       .en_i   ( we_i[0]                          ),
       .d_i    ( {a_y1,   a_y0,   b_y1,   b_y0}   ),
       .q_o    ( {a_y1_q, a_y0_q, b_y1_q, b_y0_q} )
     );

   end else begin : gen_no_prim_flop_ab_y10
     // When using un-pipelined multipliers, there is no need to insert additional registers.
     // We drive the corresponding inputs to 0 so that the result is functionally incorrect in case
     // the pipelined inputs are erroneously used.

     assign a_y1_q = '0;
     assign a_y0_q = '0;
     assign b_y1_q = '0;
     assign b_y0_q = '0;
   end

   // Masked multiplication y1 * y0. The 8 bits of prd_i.prd_1 are split into a lower nibble used
   // for blinding and an upper nibble used for resharing.
   logic [7:0] b_y10_prd1;
   aes_dom_dep_mul_gf2pn #(
     .NPower      ( 4           ),
     .Pipeline    ( PipelineMul ),
     .PreDomIndep ( 1'b0        )
   ) u_aes_dom_mul_y1_y0 (
     .clk_i  ( clk_i            ),
     .rst_ni ( rst_ni           ),
     .we_i   ( we_i[0]          ),
     .a_x    ( a_y1             ), // Share a of x
     .a_y    ( a_y0             ), // Share a of y
     .b_x    ( b_y1             ), // Share b of x
     .b_y    ( b_y0             ), // Share b of y
     .a_x_q  ( a_y1_q           ), // Share a of x, pipelined (for Pipeline=1 or PreDomIndep=1)
     .a_y_q  ( a_y0_q           ), // Share a of y, pipelined (for Pipeline=1)
     .b_x_q  ( b_y1_q           ), // Share b of x, pipelined (for Pipeline=1 or PreDomIndep=1)
     .b_y_q  ( b_y0_q           ), // Share b of y, pipelined (for Pipeline=1)
     .z_0    ( prd_i.prd_1[3:0] ), // Randomness for blinding
     .z_1    ( prd_i.prd_1[7:4] ), // Randomness for resharing
     .a_q    ( a_y1_y0          ), // Share a of q
     .b_q    ( b_y1_y0          ), // Share b of q
     .prd_o  ( b_y10_prd1       )  // Randomness for use in another S-Box instance
   );

   // Combine the registered square-scale result with the product, again separately per share.
   logic [3:0] a_gamma, b_gamma;
   assign a_gamma = a_y_ss_q ^ a_y1_y0;
   assign b_gamma = b_y_ss_q ^ b_y1_y0;

   // Avoid aggressive synthesis optimizations.
   logic [3:0] a_gamma_buf, b_gamma_buf;
   prim_buf #(
     .Width ( 8 )
   ) u_prim_buf_ab_gamma (
     .in_i  ( {a_gamma,     b_gamma}     ),
     .out_o ( {a_gamma_buf, b_gamma_buf} )
   );

   // Use intermediate results for generating PRD for Stage 2 of another S-Box instance.
   // Use one share only. Directly use output of flops updating with we_i[0].
   // b_y10_prd1 is based on b_y and XORed with prd_1. We just use the lower part involving a
   // non-linear element.
   assign prd_o.prd_1 = b_y10_prd1[3:0];
   // The upper part is intentionally left unused. It is tied to a dummy signal, presumably to
   // avoid lint warnings about unused bits.
   logic [3:0] unused_prd;
   assign unused_prd  = b_y10_prd1[7:4];

   ////////////////////
   // Stages 2 and 3 //
   ////////////////////

   // Shares of the GF(2^4) inverse of gamma.
   logic [3:0] a_theta, b_theta;

   // a_gamma is masked by b_gamma, a_gamma_inv is masked by b_gamma_inv.
   aes_dom_inverse_gf2p4 #(
     .PipelineMul ( PipelineMul )
   ) u_aes_dom_inverse_gf2p4 (
     .clk_i       ( clk_i       ),
     .rst_ni      ( rst_ni      ),
     .we_i        ( we_i[2:1]   ),
     .a_gamma     ( a_gamma_buf ),
     .b_gamma     ( b_gamma_buf ),
     .prd_2_i     ( prd_i.prd_2 ),
     .prd_3_i     ( prd_i.prd_3 ),
     .a_gamma_inv ( a_theta     ),
     .b_gamma_inv ( b_theta     ),
     .prd_2_o     ( prd_o.prd_2 ),
     .prd_3_o     ( prd_o.prd_3 )
   );

   /////////////
   // Stage 4 //
   /////////////
   // Formulas 18 and 19 in [2].

   // Register the input nibbles with we_i[2] for use as multiplier operands in Stage 4.
   // NOTE(review): The flops sample a_y/b_y directly, i.e., the module inputs are presumably
   // required to remain stable until we_i[2] is asserted - confirm against the instantiating logic.
   logic [3:0] a_y1_qqq, a_y0_qqq, b_y1_qqq, b_y0_qqq;
   prim_flop_en #(
     .Width      ( 16 ),
     .ResetValue ( '0 )
   ) u_prim_flop_ab_y10_qqq (
     .clk_i  ( clk_i                                    ),
     .rst_ni ( rst_ni                                   ),
     .en_i   ( we_i[2]                                  ),
     .d_i    ( {a_y1,     a_y0,     b_y1,     b_y0}     ),
     .q_o    ( {a_y1_qqq, a_y0_qqq, b_y1_qqq, b_y0_qqq} )
   );

   // theta * y1 forms the lower nibble of the output shares. The upper nibble of prd_i.prd_4 is
   // used for resharing.
   aes_dom_indep_mul_gf2pn #(
     .NPower   ( 4           ),
     .Pipeline ( PipelineMul )
   ) u_aes_dom_mul_theta_y1 (
     .clk_i  ( clk_i            ),
     .rst_ni ( rst_ni           ),
     .we_i   ( we_i[3]          ),
     .a_x    ( a_y1_qqq         ), // Share a of x
     .a_y    ( a_theta          ), // Share a of y
     .b_x    ( b_y1_qqq         ), // Share b of x
     .b_y    ( b_theta          ), // Share b of y
     .z_0    ( prd_i.prd_4[7:4] ), // Randomness for resharing
     .a_q    ( a_y_inv[3:0]     ), // Share a of q
     .b_q    ( b_y_inv[3:0]     )  // Share b of q
   );

   // theta * y0 forms the upper nibble of the output shares. The lower nibble of prd_i.prd_4 is
   // used for resharing.
   aes_dom_indep_mul_gf2pn #(
     .NPower   ( 4           ),
     .Pipeline ( PipelineMul )
   ) u_aes_dom_mul_theta_y0 (
     .clk_i  ( clk_i            ),
     .rst_ni ( rst_ni           ),
     .we_i   ( we_i[3]          ),
     .a_x    ( a_theta          ), // Share a of x
     .a_y    ( a_y0_qqq         ), // Share a of y
     .b_x    ( b_theta          ), // Share b of x
     .b_y    ( b_y0_qqq         ), // Share b of y
     .z_0    ( prd_i.prd_4[3:0] ), // Randomness for resharing
     .a_q    ( a_y_inv[7:4]     ), // Share a of q
     .b_q    ( b_y_inv[7:4]     )  // Share b of q
   );

 endmodule

 // Top level of the AES S-Box with first-order domain-oriented masking.
 //
 // The module converts the masked input data and the input mask to normal basis X, performs the
 // masked inversion in aes_dom_inverse_gf2p8, and converts both result shares back. The constant
 // 8'h63 of the affine transformation is applied to the data share only.
 //
 // Sequencing and handshake:
 // - An internal counter advances by one in every cycle in which en_i is set and out_req_o is not.
 // - we[N] is asserted while en_i is set and the counter equals N (N = 0..3).
 // - out_req_o is asserted while en_i is set and the counter equals 4.
 // - The counter is cleared when out_req_o and out_ack_i are both set. Without out_ack_i, it holds
 //   its value, i.e., out_req_o remains asserted as long as en_i is set.
 //
 // Any value of op_i other than CIPH_FWD and CIPH_INV is handled like CIPH_FWD.
 //
 // SEC_CM: KEY.MASKING
 module aes_sbox_dom
 #(
   parameter bit PipelineMul = 1'b1
 ) (
   input  logic              clk_i,
   input  logic              rst_ni,
   input  logic              en_i,
   output logic              out_req_o,
   input  logic              out_ack_i,
   input  aes_pkg::ciph_op_e op_i,
   input  logic        [7:0] data_i, // masked, the actual input data is data_i ^ mask_i
   input  logic        [7:0] mask_i, // input mask
   input  logic       [27:0] prd_i,  // pseudo-random data for remasking, in total we need 28 bits
                                     // of PRD per evaluation, but at most 8 bits per cycle
   output logic        [7:0] data_o, // masked, the actual output data is data_o ^ mask_o
   output logic        [7:0] mask_o, // output mask
   output logic       [19:0] prd_o   // PRD for usage in Stages 2 - 4 of other S-Box instances
 );

   import aes_pkg::*;
   import aes_sbox_canright_pkg::*;

   logic [7:0] in_data_basis_x, out_data_basis_x;
   logic [7:0] in_mask_basis_x, out_mask_basis_x;
   logic [3:0] we;             // per-stage write enables for the inversion
   logic [7:0] prd1_d, prd1_q; // buffered PRD for Stage 1
   prd_in_t    in_prd;
   prd_out_t   out_prd;

   // Convert data to normal basis X.
   // For CIPH_INV, the constant 8'h63 is added to the data share prior to the conversion.
   assign in_data_basis_x = (op_i == CIPH_FWD) ? aes_mvm(data_i, A2X)         :
                            (op_i == CIPH_INV) ? aes_mvm(data_i ^ 8'h63, S2X) :
                                                 aes_mvm(data_i, A2X);

   // Convert mask to normal basis X.
   // The addition of constant 8'h63 prior to the affine transformation is skipped.
   assign in_mask_basis_x = (op_i == CIPH_FWD) ? aes_mvm(mask_i, A2X) :
                            (op_i == CIPH_INV) ? aes_mvm(mask_i, S2X) :
                                                 aes_mvm(mask_i, A2X);

   // Do the inversion in normal basis X.
   aes_dom_inverse_gf2p8 #(
     .PipelineMul ( PipelineMul )
   ) u_aes_dom_inverse_gf2p8 (
     .clk_i   ( clk_i            ),
     .rst_ni  ( rst_ni           ),
     .we_i    ( we               ),
     .a_y     ( in_data_basis_x  ), // input
     .b_y     ( in_mask_basis_x  ), // input
     .prd_i   ( in_prd           ), // input
     .a_y_inv ( out_data_basis_x ), // output
     .b_y_inv ( out_mask_basis_x ), // output
     .prd_o   ( out_prd          )  // output
   );

   // Convert data to basis S or A.
   // For CIPH_FWD, the constant 8'h63 is added to the data share following the conversion.
   assign data_o = (op_i == CIPH_FWD) ? (aes_mvm(out_data_basis_x, X2S) ^ 8'h63) :
                   (op_i == CIPH_INV) ? (aes_mvm(out_data_basis_x, X2A))         :
                                        (aes_mvm(out_data_basis_x, X2S) ^ 8'h63);

   // Convert mask to basis S or A.
   // The addition of constant 8'h63 following the affine transformation is skipped.
   assign mask_o = (op_i == CIPH_FWD) ? aes_mvm(out_mask_basis_x, X2S) :
                   (op_i == CIPH_INV) ? aes_mvm(out_mask_basis_x, X2A) :
                                        aes_mvm(out_mask_basis_x, X2S);

   // Counter register
   // Priority of the next-state selection:
   // 1. Output requested and acknowledged: clear the counter.
   // 2. Output requested but not acknowledged: hold the counter.
   // 3. Enabled: advance the counter.
   // 4. Otherwise: hold the counter.
   logic [2:0] count_d, count_q;
   assign count_d = (out_req_o && out_ack_i) ? '0             :
                    out_req_o                ? count_q        :
                    en_i                     ? count_q + 3'd1 : count_q;
   always_ff @(posedge clk_i or negedge rst_ni) begin : reg_count
     if (!rst_ni) begin
       count_q <= '0;
     end else begin
       count_q <= count_d;
     end
   end
   // The result is signaled once the counter has reached 4. The equality comparison binds tighter
   // than the bitwise AND, i.e., this evaluates to en_i & (count_q == 3'd4).
   assign out_req_o = en_i & count_q == 3'd4;

   // Write enable signals for internal registers
   // At most one bit is set at a time: we[N] is set in the cycle in which the counter equals N.
   assign we[0] = en_i & count_q == 3'd0;
   assign we[1] = en_i & count_q == 3'd1;
   assign we[2] = en_i & count_q == 3'd2;
   assign we[3] = en_i & count_q == 3'd3;

   // Buffer and forward PRD for the individual stages. We get 8 bits from the PRNG for usage in the
   // first cycle. Stages 2, 3 and 4 are driven by other S-Box instances.
   // While we[0] is set, prd_i[7:0] is forwarded directly and captured in prd1_q. Afterwards, the
   // registered value is forwarded such that prd_1 does not follow later changes of prd_i[7:0].
   assign prd1_d = we[0] ? prd_i[7:0] : prd1_q;
   prim_flop #(
     .Width      ( 8  ),
     .ResetValue ( '0 )
   ) u_prim_flop_prd1_q (
     .clk_i  ( clk_i  ),
     .rst_ni ( rst_ni ),
     .d_i    ( prd1_d ),
     .q_o    ( prd1_q )
   );
   // The PRD for Stages 2, 3 and 4 is taken from prd_i without buffering.
   assign in_prd = '{prd_1: prd1_d,
                     prd_2: prd_i[11:8],
                     prd_3: prd_i[19:12],
                     prd_4: prd_i[27:20]};
   // Pack the PRD generated by this instance, prd_1 ends up in the least significant bits.
   assign prd_o = {out_prd.prd_3, out_prd.prd_2, out_prd.prd_1};

 endmodule