 // hw/ip/aes/rtl/aes_sbox_dom.sv - 3p/lowrisc/opentitan - Git at Google

 // Copyright lowRISC contributors.
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 //
 // AES S-Box with First-Order Domain-Oriented Masking
 //
 // This is the unpipelined version using DOM-dep multipliers. It has a latency of 5 clock cycles
 // and requires 28 bits of pseudo-random data per evaluation. Pipelining would only be beneficial
 // when using
 // - either a cipher core architecture with a data path smaller than 128 bit, i.e., where the
 //   individual S-Boxes are evaluated more than once per round, or
 // - a fully unrolled cipher core architecture for maximum throughput.
 //
 // Note: The DOM AES S-Box is built on top of the Canright masked S-Box without mask re-use.
 //
 // For details, see the following papers and reports:
 // [1] Gross, "Domain-Oriented Masking: Compact Masked Hardware Implementations with Arbitrary
 //     Protection Order" available at https://eprint.iacr.org/2016/486.pdf
 // [2] Canright, "A very compact 'perfectly masked' S-box for AES (corrected)" available at
 //     https://eprint.iacr.org/2009/011.pdf
 // [3] Canright, "A very compact Rijndael S-box" available at https://hdl.handle.net/10945/25608
 //
 // Using the Coco-Alma tool in transient mode, this implementation has been formally verified to be
 // secure against first-order side-channel analysis (SCA). For more information on the tool,
 // refer to the following papers:
 // [4] Gigerl, "COCO: Co-design and co-verification of masked software implementations on CPUs"
 //     available at https://eprint.iacr.org/2020/1294.pdf
 // [5] Bloem, "Formal verification of masked hardware implementations in the presence of glitches"
 //     available at https://eprint.iacr.org/2017/897.pdf

 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // IMPORTANT NOTE:                                                                               //
 //                            DO NOT USE THIS FOR SYNTHESIS BLINDLY!                             //
 //                                                                                               //
 // This implementation relies on primitive cells like prim_buf/flop_en containing tool-specific  //
 // synthesis attributes to prevent the synthesis tool from optimizing away/re-ordering registers //
 // and to enforce the correct ordering of operations. Without the proper primitives, synthesis   //
 // tools might heavily optimize the design. The result is likely insecure. Use with care.        //
 ///////////////////////////////////////////////////////////////////////////////////////////////////

 `include "prim_assert.sv"

 // Packed struct for pseudo-random data (PRD) distribution. Stages 1, 3 and 4 require 8 bits each.
 // Stage 2 requires just 4 bits. This adds up to 28 bits in total.
 //
 // Bit usage as wired up in aes_dom_inverse_gf2p8 and aes_dom_inverse_gf2p4:
 // - prd_1: [3:0] blinding (z_0) and [7:4] resharing (z_1) of the Stage 1 multiplier y1 * y0.
 // - prd_2: [1:0] blinding (z_0) and [3:2] resharing (z_1) of the Stage 2 multiplier
 //          gamma1 * gamma0.
 // - prd_3: [5:4] blinding and [7:6] resharing of omega * gamma1,
 //          [1:0] blinding and [3:2] resharing of omega * gamma0 (both Stage 3).
 // - prd_4: [7:4] resharing of theta * y1, [3:0] resharing of theta * y0 (both Stage 4).
 typedef struct packed {
   logic [7:0] prd_1;
   logic [3:0] prd_2;
   logic [7:0] prd_3;
   logic [7:0] prd_4;
 } prd_t;

 // DOM-indep GF(2^N) multiplier, first-order masked.
 // Computes (a_q ^ b_q) = (a_x ^ b_x) * (a_y ^ b_y), i.e. q = x * y using first-order
 // domain-oriented masking. The sharings of x and y are required to be uniformly random and
 // independent from each other.
 // See Fig. 2 in [1].
 //
 // Parameters:
 // - NPower:   Selects the field. 4 uses aes_mul_gf2p4, any other value uses aes_mul_gf2p2. An
 //             initial assertion restricts the value to 4 or 2.
 // - Pipeline: 1'b1 registers the inner-domain terms (enabled by we_i) before integration,
 //             1'b0 only passes them through a prim_buf.
 //
 // The reshared cross-domain terms are always registered (enabled by we_i). The outputs a_q and
 // b_q are combinational functions of these registers and of the inner-domain terms.
 module aes_dom_indep_mul_gf2pn #(
   parameter int unsigned NPower   = 4,
   parameter bit          Pipeline = 1'b0
 ) (
   input  logic              clk_i,
   input  logic              rst_ni,
   // Enable for all registers inside this multiplier.
   input  logic              we_i,
   input  logic [NPower-1:0] a_x,    // Share a of x
   input  logic [NPower-1:0] a_y,    // Share a of y
   input  logic [NPower-1:0] b_x,    // Share b of x
   input  logic [NPower-1:0] b_y,    // Share b of y
   input  logic [NPower-1:0] z_0,    // Randomness for resharing
   output logic [NPower-1:0] a_q,    // Share a of q
   output logic [NPower-1:0] b_q     // Share b of q
 );

   import aes_sbox_canright_pkg::*;

   /////////////////
   // Calculation //
   /////////////////
   // Inner-domain terms: products of shares belonging to the same domain (a * a, b * b).
   logic [NPower-1:0] mul_ax_ay_d, mul_bx_by_d;
   if (NPower == 4) begin : gen_inner_mul_gf2p4
     assign mul_ax_ay_d = aes_mul_gf2p4(a_x, a_y);
     assign mul_bx_by_d = aes_mul_gf2p4(b_x, b_y);

   end else begin : gen_inner_mul_gf2p2
     assign mul_ax_ay_d = aes_mul_gf2p2(a_x, a_y);
     assign mul_bx_by_d = aes_mul_gf2p2(b_x, b_y);
   end

   // Cross-domain terms: products of shares belonging to different domains (a * b).
   logic [NPower-1:0] mul_ax_by, mul_ay_bx;
   if (NPower == 4) begin : gen_cross_mul_gf2p4
     assign mul_ax_by = aes_mul_gf2p4(a_x, b_y);
     assign mul_ay_bx = aes_mul_gf2p4(a_y, b_x);

   end else begin : gen_cross_mul_gf2p2
     assign mul_ax_by = aes_mul_gf2p2(a_x, b_y);
     assign mul_ay_bx = aes_mul_gf2p2(a_y, b_x);
   end

   ///////////////
   // Resharing //
   ///////////////
   // Resharing of cross-domain terms. Both terms are masked with the same z_0 such that the
   // randomness cancels out when the two output shares are combined.
   logic [NPower-1:0] aq_z0_d, bq_z0_d;
   logic [NPower-1:0] aq_z0_q, bq_z0_q;
   assign aq_z0_d = z_0 ^ mul_ax_by;
   assign bq_z0_d = z_0 ^ mul_ay_bx;

   // Registers holding the reshared cross-domain terms.
   prim_flop_en #(
     .Width      ( 2*NPower ),
     .ResetValue ( '0       )
   ) u_prim_flop_abq_z0 (
     .clk_i  ( clk_i              ),
     .rst_ni ( rst_ni             ),
     .en_i   ( we_i               ),
     .d_i    ( {aq_z0_d, bq_z0_d} ),
     .q_o    ( {aq_z0_q, bq_z0_q} )
   );

   /////////////////////////
   // Optional Pipelining //
   /////////////////////////
   // Inner-domain terms as used for the integration, either registered or buffered.
   logic [NPower-1:0] mul_ax_ay, mul_bx_by;

   if (Pipeline == 1'b1) begin : gen_pipeline
     // Add pipeline registers on inner-domain terms prior to integration. This allows accepting new
     // input data every clock cycle and prevents SCA leakage occurring due to the integration of
     // reshared cross-domain terms with inner-domain terms derived from different input data.

     logic [NPower-1:0] mul_ax_ay_q, mul_bx_by_q;
     prim_flop_en #(
       .Width      ( 2*NPower ),
       .ResetValue ( '0       )
     ) u_prim_flop_mul_abx_aby (
       .clk_i  ( clk_i                      ),
       .rst_ni ( rst_ni                     ),
       .en_i   ( we_i                       ),
       .d_i    ( {mul_ax_ay_d, mul_bx_by_d} ),
       .q_o    ( {mul_ax_ay_q, mul_bx_by_q} )
     );

     assign mul_ax_ay = mul_ax_ay_q;
     assign mul_bx_by = mul_bx_by_q;

   end else begin : gen_no_pipeline
     // Do not add the optional pipeline registers on the inner-domain terms. This allows to save
     // some area in case the multiplier does not need to accept new data in every cycle. However,
     // this can cause SCA leakage as during the clock cycle in which new data arrives, the new
     // inner-domain terms are integrated with the previous, reshared cross-domain terms.

     // Avoid aggressive synthesis optimizations.
     logic [NPower-1:0] mul_ax_ay_buf, mul_bx_by_buf;
     prim_buf #(
       .Width  ( 2*NPower )
     ) u_prim_buf_mul_abx_aby (
       .in_i  ( {mul_ax_ay_d,   mul_bx_by_d}   ),
       .out_o ( {mul_ax_ay_buf, mul_bx_by_buf} )
     );

     assign mul_ax_ay = mul_ax_ay_buf;
     assign mul_bx_by = mul_bx_by_buf;
   end

   /////////////////
   // Integration //
   /////////////////
   // Each output share combines the inner-domain term of its own domain with one registered,
   // reshared cross-domain term.
   assign a_q = mul_ax_ay ^ aq_z0_q;
   assign b_q = mul_bx_by ^ bq_z0_q;

   // Only GF(2^4) and GF(2^2) is supported.
   `ASSERT_INIT(AesDomIndepMulPower, NPower == 4 || NPower == 2)

 endmodule

 // DOM-dep GF(2^N) multiplier, first-order masked.
 // Computes (a_q ^ b_q) = (a_x ^ b_x) * (a_y ^ b_y), i.e. q = x * y using first-order
 // domain-oriented masking. The sharings of x and y are NOT required to be independent from each
 // other. This is the un-optimized version consuming 3 times N bits of randomness for blinding and
 // resharing. It is not used in the design but we keep it for reference.
 // See Fig. 4 and Formulas 8 - 11 in [1].
 //
 // Structure: y is blinded with the shared random value z and registered. In parallel, a DOM-indep
 // multiplier computes a sharing of x * z. The output is x * z ^ x * (y ^ z) = x * y.
 //
 // Parameters:
 // - NPower:   Selects the field. 4 uses aes_mul_gf2p4, any other value uses aes_mul_gf2p2. An
 //             initial assertion restricts the value to 4 or 2.
 // - Pipeline: 1'b1 adds registers (enabled by we_i) on input x. The value is also forwarded to
 //             the internal DOM-indep multiplier.
 module aes_dom_dep_mul_gf2pn_unopt #(
   parameter int unsigned NPower   = 4,
   parameter bit          Pipeline = 1'b0
 ) (
   input  logic              clk_i,
   input  logic              rst_ni,
   // Enable for all registers inside this multiplier.
   input  logic              we_i,
   input  logic [NPower-1:0] a_x,    // Share a of x
   input  logic [NPower-1:0] a_y,    // Share a of y
   input  logic [NPower-1:0] b_x,    // Share b of x
   input  logic [NPower-1:0] b_y,    // Share b of y
   input  logic [NPower-1:0] a_z,    // Randomness for blinding
   input  logic [NPower-1:0] b_z,    // Randomness for blinding
   input  logic [NPower-1:0] z_0,    // Randomness for resharing
   output logic [NPower-1:0] a_q,    // Share a of q
   output logic [NPower-1:0] b_q     // Share b of q
 );

   import aes_sbox_canright_pkg::*;

   //////////////
   // Blinding //
   //////////////
   // Blinding of y by z. This is done per share, i.e., share a of y with share a of z and share b
   // of y with share b of z.
   logic [NPower-1:0] a_yz_d, b_yz_d;
   logic [NPower-1:0] a_yz_q, b_yz_q;
   assign a_yz_d = a_y ^ a_z;
   assign b_yz_d = b_y ^ b_z;

   // Registers holding the blinded shares of y.
   prim_flop_en #(
     .Width      ( 2*NPower ),
     .ResetValue ( '0       )
   ) u_prim_flop_ab_yz (
     .clk_i  ( clk_i            ),
     .rst_ni ( rst_ni           ),
     .en_i   ( we_i             ),
     .d_i    ( {a_yz_d, b_yz_d} ),
     .q_o    ( {a_yz_q, b_yz_q} )
   );

   ////////////////
   // Correction //
   ////////////////
   // Compute a sharing of x * z. It is needed to remove the blinding term again during
   // integration.
   logic [NPower-1:0] a_mul_x_z, b_mul_x_z;
   aes_dom_indep_mul_gf2pn #(
     .NPower   ( NPower   ),
     .Pipeline ( Pipeline )
   ) u_aes_dom_indep_mul_gf2pn (
     .clk_i  ( clk_i     ),
     .rst_ni ( rst_ni    ),
     .we_i   ( we_i      ),
     .a_x    ( a_x       ), // Share a of x
     .a_y    ( a_z       ), // Share a of z
     .b_x    ( b_x       ), // Share b of x
     .b_y    ( b_z       ), // Share b of z
     .z_0    ( z_0       ), // Randomness for resharing
     .a_q    ( a_mul_x_z ), // Share a of x * z
     .b_q    ( b_mul_x_z )  // Share b of x * z
   );

   /////////////////////////
   // Optional Pipelining //
   /////////////////////////
   // Shares of x as used for the multiplication with b, either registered or passed through.
   logic [NPower-1:0] a_x_calc, b_x_calc;

   if (Pipeline == 1'b1) begin : gen_pipeline
     // Add pipeline registers for input x. This allows accepting new input data every clock cycle
     // and prevents SCA leakage occurring due to the multiplication of input x with b belonging to
     // different clock cycles.

     logic [NPower-1:0] a_x_q, b_x_q;
     prim_flop_en #(
       .Width      ( 2*NPower ),
       .ResetValue ( '0       )
     ) u_prim_flop_ab_x (
       .clk_i  ( clk_i          ),
       .rst_ni ( rst_ni         ),
       .en_i   ( we_i           ),
       .d_i    ( {a_x,   b_x}   ),
       .q_o    ( {a_x_q, b_x_q} )
     );

     assign a_x_calc = a_x_q;
     assign b_x_calc = b_x_q;

   end else begin : gen_no_pipeline
     // Do not add the optional pipeline registers for input x. This allows to save some area in
     // case the multiplier does not need to accept new data in every cycle. However, this can cause
     // SCA leakage as during the clock cycle in which new data arrives, the new x input is
     // multiplied with the previous b.

     assign a_x_calc = a_x;
     assign b_x_calc = b_x;
   end

   /////////////////
   // Calculation //
   /////////////////
   // Combine shares of blinded y to obtain b. This equals y ^ z and is used by both domains.
   logic [NPower-1:0] b;
   assign b = a_yz_q ^ b_yz_q;

   // Multiply each share of x with b.
   logic [NPower-1:0] a_mul_ax_b, b_mul_bx_b;
   if (NPower == 4) begin : gen_mul_gf2p4
     assign a_mul_ax_b = aes_mul_gf2p4(a_x_calc, b);
     assign b_mul_bx_b = aes_mul_gf2p4(b_x_calc, b);

   end else begin : gen_mul_gf2p2
     assign a_mul_ax_b = aes_mul_gf2p2(a_x_calc, b);
     assign b_mul_bx_b = aes_mul_gf2p2(b_x_calc, b);
   end

   /////////////////
   // Integration //
   /////////////////
   // Add the correction term x * z to x * (y ^ z), on a per-domain basis.
   assign a_q = a_mul_x_z ^ a_mul_ax_b;
   assign b_q = b_mul_x_z ^ b_mul_bx_b;

   // Only GF(2^4) and GF(2^2) is supported.
   `ASSERT_INIT(AesDomDepMulUnoptPower, NPower == 4 || NPower == 2)

 endmodule

 // DOM-dep GF(2^N) multiplier, first-order masked.
 // Computes (a_q ^ b_q) = (a_x ^ b_x) * (a_y ^ b_y), i.e. q = x * y using first-order
 // domain-oriented masking. The sharings of x and y are NOT required to be independent from each
 // other. This is the optimized version consuming 2 instead of 3 times N bits of randomness for
 // blinding and resharing.
 // See Formula 12 in [1].
 //
 // Parameters:
 // - NPower:      Selects the field. 4 uses aes_mul_gf2p4, any other value uses aes_mul_gf2p2. An
 //                initial assertion restricts the value to 4 or 2.
 // - Pipeline:    1'b1 adds registers (enabled by we_i) on inputs x and y. This only has an
 //                effect if PreDomIndep is 1'b0.
 // - PreDomIndep: See the comments on the parameter itself.
 module aes_dom_dep_mul_gf2pn #(
   parameter int unsigned NPower      = 4,
   parameter bit          Pipeline    = 1'b0,
   parameter bit          PreDomIndep = 1'b0 // 1'b0: Not followed by an un-pipelined DOM-indep
                                             //       multiplier, this enables additional area
                                             //       optimizations
                                             // 1'b1: Directly followed by an un-pipelined
                                             //       DOM-indep multiplier, this is the version
                                             //       discussed in [1].
 ) (
   input  logic              clk_i,
   input  logic              rst_ni,
   // Enable for all registers inside this multiplier.
   input  logic              we_i,
   input  logic [NPower-1:0] a_x,    // Share a of x
   input  logic [NPower-1:0] a_y,    // Share a of y
   input  logic [NPower-1:0] b_x,    // Share b of x
   input  logic [NPower-1:0] b_y,    // Share b of y
   input  logic [NPower-1:0] z_0,    // Randomness for blinding
   input  logic [NPower-1:0] z_1,    // Randomness for resharing
   output logic [NPower-1:0] a_q,    // Share a of q
   output logic [NPower-1:0] b_q     // Share b of q
 );

   import aes_sbox_canright_pkg::*;

   //////////////
   // Blinding //
   //////////////
   // Blinding of y by z_0. Both shares of y are blinded with the same z_0.
   logic [NPower-1:0] a_yz0_d, b_yz0_d;
   logic [NPower-1:0] a_yz0_q, b_yz0_q;
   assign a_yz0_d = a_y ^ z_0;
   assign b_yz0_d = b_y ^ z_0;

   // Registers holding the blinded shares of y.
   prim_flop_en #(
     .Width      ( 2*NPower ),
     .ResetValue ( '0       )
   ) u_prim_flop_ab_yz0 (
     .clk_i  ( clk_i              ),
     .rst_ni ( rst_ni             ),
     .en_i   ( we_i               ),
     .d_i    ( {a_yz0_d, b_yz0_d} ),
     .q_o    ( {a_yz0_q, b_yz0_q} )
   );

   ////////////////
   // Correction //
   ////////////////
   // Basically, this is a DOM-indep multiplier with:
   // - a_x = a_x, b_x = b_x, and
   // - a_y = z_0, b_y = 0 (constant),
   // which allows for further optimizations.

   // Calculation
   logic [NPower-1:0] mul_ax_z0, mul_bx_z0;
   if (NPower == 4) begin : gen_corr_mul_gf2p4
     assign mul_ax_z0 = aes_mul_gf2p4(a_x, z_0);
     assign mul_bx_z0 = aes_mul_gf2p4(b_x, z_0);

   end else begin : gen_corr_mul_gf2p2
     assign mul_ax_z0 = aes_mul_gf2p2(a_x, z_0);
     assign mul_bx_z0 = aes_mul_gf2p2(b_x, z_0);
   end

   // Avoid aggressive synthesis optimizations.
   logic [NPower-1:0] mul_ax_z0_buf, mul_bx_z0_buf;
   prim_buf #(
     .Width ( 2*NPower )
   ) u_prim_buf_mul_abx_z0 (
     .in_i  ( {mul_ax_z0,     mul_bx_z0}     ),
     .out_o ( {mul_ax_z0_buf, mul_bx_z0_buf} )
   );

   // Resharing. Both correction terms are masked with the same z_1 such that the randomness
   // cancels out when the two output shares are combined.
   logic [NPower-1:0] axz0_z1_d, bxz0_z1_d;
   logic [NPower-1:0] axz0_z1_q, bxz0_z1_q;
   assign axz0_z1_d = mul_ax_z0_buf ^ z_1;
   assign bxz0_z1_d = mul_bx_z0_buf ^ z_1;

   // Registers holding the reshared correction terms.
   prim_flop_en #(
     .Width      ( 2*NPower ),
     .ResetValue ( '0       )
   ) u_prim_flop_abxz0_z1 (
     .clk_i  ( clk_i                  ),
     .rst_ni ( rst_ni                 ),
     .en_i   ( we_i                   ),
     .d_i    ( {axz0_z1_d, bxz0_z1_d} ),
     .q_o    ( {axz0_z1_q, bxz0_z1_q} )
   );

   /////////////////////////
   // Optional Pipelining //
   /////////////////////////
   // Shares of x and y as used for the calculation below, either registered or passed through.
   logic [NPower-1:0] a_x_calc, b_x_calc, a_y_calc, b_y_calc;

   if (Pipeline == 1'b1 && PreDomIndep != 1'b1) begin : gen_pipeline
     // Add pipeline registers for inputs x and y. This allows accepting new input data every clock
     // cycle and prevents SCA leakage occurring due to the multiplication of inputs x and y with
     // d_b belonging to different clock cycles.
     //
     // The PreDomIndep variant has the required pipeline registers built in already.

     logic [NPower-1:0] a_x_q, b_x_q, a_y_q, b_y_q;
     prim_flop_en #(
       .Width      ( 4*NPower ),
       .ResetValue ( '0       )
     ) u_prim_flop_ab_xy (
       .clk_i  ( clk_i                        ),
       .rst_ni ( rst_ni                       ),
       .en_i   ( we_i                         ),
       .d_i    ( {a_x,   b_x,   a_y,   b_y}   ),
       .q_o    ( {a_x_q, b_x_q, a_y_q, b_y_q} )
     );

     assign a_x_calc = a_x_q;
     assign b_x_calc = b_x_q;
     assign a_y_calc = a_y_q;
     assign b_y_calc = b_y_q;

   end else begin : gen_no_pipeline
     // Do not add the optional pipeline registers for inputs x and y. This allows to save some area
     // in case the multiplier does not need to accept new data in every cycle. However, this can
     // cause SCA leakage as during the clock cycle in which new data arrives, the new x and y
     // inputs are multiplied with the previous d_b.

     assign a_x_calc = a_x;
     assign b_x_calc = b_x;
     assign a_y_calc = a_y;
     assign b_y_calc = b_y;
   end

   ///////////////////////////////
   // Calculation & Integration //
   ///////////////////////////////
   // Compute b. Note that unlike for the unoptimized implementation, we don't combine the blinded
   // shares of y to obtain a single b value. Instead, every domain d gets its own version of b:
   //
   //   d_b = d_y ^ _D_y_z0
   //
   // where _D_y_z0 corresponds to the sum of all domains of y except for domain d, each
   // individually blinded by z0 (needs to happen before the register bank). This optimization
   // is only suitable for first-order masking.
   // See Formula 12 in [1].

   if (PreDomIndep == 1'b1) begin : gen_pre_dom_indep
     // This DOM-dep multiplier is directly followed by an un-pipelined DOM-indep multiplier. To
     // prevent SCA leakage in the un-pipelined DOM-indep multiplier, the d_y and _D_y_z0 parts of
     // d_b need to be individually multiplied with input x and then the results need to be
     // integrated (summed up) on a per-domain basis.

     // d_y part: Inner-domain terms of x * y
     logic [NPower-1:0] mul_ax_ay_d, mul_bx_by_d;
     logic [NPower-1:0] mul_ax_ay_q, mul_bx_by_q;
     if (NPower == 4) begin : gen_inner_mul_gf2p4
       assign mul_ax_ay_d = aes_mul_gf2p4(a_x_calc, a_y_calc);
       assign mul_bx_by_d = aes_mul_gf2p4(b_x_calc, b_y_calc);

     end else begin : gen_inner_mul_gf2p2
       assign mul_ax_ay_d = aes_mul_gf2p2(a_x_calc, a_y_calc);
       assign mul_bx_by_d = aes_mul_gf2p2(b_x_calc, b_y_calc);
     end

     // Registers holding the inner-domain terms.
     prim_flop_en #(
       .Width      ( 2*NPower ),
       .ResetValue ( '0       )
     ) u_prim_flop_mul_abx_aby (
       .clk_i  ( clk_i                      ),
       .rst_ni ( rst_ni                     ),
       .en_i   ( we_i                       ),
       .d_i    ( {mul_ax_ay_d, mul_bx_by_d} ),
       .q_o    ( {mul_ax_ay_q, mul_bx_by_q} )
     );

     // Input Registers. Only the shares of x are registered here.
     logic [NPower-1:0] a_x_q, b_x_q;
     prim_flop_en #(
       .Width      ( 2*NPower ),
       .ResetValue ( '0       )
     ) u_prim_flop_ab_xy (
       .clk_i  ( clk_i                ),
       .rst_ni ( rst_ni               ),
       .en_i   ( we_i                 ),
       .d_i    ( {a_x_calc, b_x_calc} ),
       .q_o    ( {a_x_q,    b_x_q}    )
     );

     // _D_y_z0 part: Cross-domain terms: d_x * _D_y_z0
     // Need to use registered version of input x.
     logic [NPower-1:0] mul_ax_byz0, mul_bx_ayz0;
     if (NPower == 4) begin : gen_cross_mul_gf2p4
       assign mul_ax_byz0 = aes_mul_gf2p4(a_x_q, b_yz0_q);
       assign mul_bx_ayz0 = aes_mul_gf2p4(b_x_q, a_yz0_q);

     end else begin : gen_cross_mul_gf2p2
       assign mul_ax_byz0 = aes_mul_gf2p2(a_x_q, b_yz0_q);
       assign mul_bx_ayz0 = aes_mul_gf2p2(b_x_q, a_yz0_q);
     end

     // Avoid aggressive synthesis optimizations.
     logic [NPower-1:0] mul_ax_byz0_buf, mul_bx_ayz0_buf;
     prim_buf #(
       .Width ( 2*NPower )
     ) u_prim_buf_mul_abx_bayz0 (
       .in_i  ( {mul_ax_byz0,     mul_bx_ayz0}     ),
       .out_o ( {mul_ax_byz0_buf, mul_bx_ayz0_buf} )
     );

     // Integration: reshared correction term, inner-domain term and cross-domain term per domain.
     assign a_q = axz0_z1_q ^ mul_ax_ay_q ^ mul_ax_byz0_buf;
     assign b_q = bxz0_z1_q ^ mul_bx_by_q ^ mul_bx_ayz0_buf;

   end else begin : gen_not_pre_dom_indep
     // This DOM-dep multiplier is not directly followed by an un-pipelined DOM-indep multiplier. As
     // a result, the d_y and _D_y_z0 parts of d_b can be summed up prior to the multiplication
     // with input x which allows saving 2 GF multipliers.

     // Sum up d_y and _D_y_z0.
     logic [NPower-1:0] a_b, b_b;
     assign a_b = a_y_calc ^ b_yz0_q;
     assign b_b = b_y_calc ^ a_yz0_q;

     // Avoid aggressive synthesis optimizations.
     logic [NPower-1:0] a_b_buf, b_b_buf;
     prim_buf #(
       .Width ( 2*NPower )
     ) u_prim_buf_ab_b (
       .in_i  ( {a_b,     b_b}     ),
       .out_o ( {a_b_buf, b_b_buf} )
     );

     // GF multiplications
     logic [NPower-1:0] a_mul_ax_b, b_mul_bx_b;
     if (NPower == 4) begin : gen_mul_gf2p4
       assign a_mul_ax_b = aes_mul_gf2p4(a_x_calc, a_b_buf);
       assign b_mul_bx_b = aes_mul_gf2p4(b_x_calc, b_b_buf);
     end else begin : gen_mul_gf2p2
       assign a_mul_ax_b = aes_mul_gf2p2(a_x_calc, a_b_buf);
       assign b_mul_bx_b = aes_mul_gf2p2(b_x_calc, b_b_buf);
     end

     // Avoid aggressive synthesis optimizations.
     logic [NPower-1:0] a_mul_ax_b_buf, b_mul_bx_b_buf;
     prim_buf #(
       .Width ( 2*NPower )
     ) u_prim_buf_ab_mul_abx_b (
       .in_i  ( {a_mul_ax_b,     b_mul_bx_b}     ),
       .out_o ( {a_mul_ax_b_buf, b_mul_bx_b_buf} )
     );

     // Integration: reshared correction term plus d_x * d_b per domain.
     assign a_q = axz0_z1_q ^ a_mul_ax_b_buf;
     assign b_q = bxz0_z1_q ^ b_mul_bx_b_buf;
   end

   // Only GF(2^4) and GF(2^2) is supported.
   `ASSERT_INIT(AesDomDepMulPower, NPower == 4 || NPower == 2)

 endmodule

 // Inverse in GF(2^4) using first-order domain-oriented masking and normal basis [z^4, z].
 // See Fig. 6 in [2] (grey block, Stages 2 and 3) and Formulas 6, 13, 14, 15, 16, 17 in [2].
 //
 // Ports:
 // - we_i:        Register enables. we_i[0] enables the Stage 2 registers and the registers
 //                holding the input halves, we_i[1] enables the registers of the Stage 3
 //                multipliers.
 // - a_gamma:     Share a of the input.
 // - b_gamma:     Share b of the input.
 // - prd_2:       Randomness for the Stage 2 multiplier ([1:0] blinding, [3:2] resharing).
 // - prd_3:       Randomness for the two Stage 3 multipliers (2 bits blinding and 2 bits
 //                resharing each).
 // - a_gamma_inv: Share a of the output.
 // - b_gamma_inv: Share b of the output.
 module aes_dom_inverse_gf2p4 (
   input  logic        clk_i,
   input  logic        rst_ni,
   input  logic  [1:0] we_i,
   input  logic  [3:0] a_gamma,
   input  logic  [3:0] b_gamma,
   input  logic  [3:0] prd_2,
   input  logic  [7:0] prd_3,
   output logic  [3:0] a_gamma_inv,
   output logic  [3:0] b_gamma_inv
 );

   import aes_sbox_canright_pkg::*;

   /////////////
   // Stage 2 //
   /////////////
   // Formula 13 in [2].

   // Split both input shares into upper (gamma1) and lower (gamma0) halves.
   logic [1:0] a_gamma1, a_gamma0, b_gamma1, b_gamma0, a_gamma1_gamma0, b_gamma1_gamma0;
   assign a_gamma1 = a_gamma[3:2];
   assign a_gamma0 = a_gamma[1:0];
   assign b_gamma1 = b_gamma[3:2];
   assign b_gamma0 = b_gamma[1:0];

   // Square and scale the sum of the two halves. This is computed per share and registered with
   // we_i[0], i.e., together with the registers inside the Stage 2 multiplier.
   logic [1:0] a_gamma_ss_d, b_gamma_ss_d;
   logic [1:0] a_gamma_ss_q, b_gamma_ss_q;
   assign a_gamma_ss_d = aes_scale_omega2_gf2p2(aes_square_gf2p2(a_gamma1 ^ a_gamma0));
   assign b_gamma_ss_d = aes_scale_omega2_gf2p2(aes_square_gf2p2(b_gamma1 ^ b_gamma0));
   prim_flop_en #(
     .Width      ( 4  ),
     .ResetValue ( '0 )
   ) u_prim_flop_ab_gamma_ss (
     .clk_i  ( clk_i                        ),
     .rst_ni ( rst_ni                       ),
     .en_i   ( we_i[0]                      ),
     .d_i    ( {a_gamma_ss_d, b_gamma_ss_d} ),
     .q_o    ( {a_gamma_ss_q, b_gamma_ss_q} )
   );

   // Masked multiplication gamma1 * gamma0. Both operands are halves of the same sharing, so the
   // DOM-dep multiplier is used.
   aes_dom_dep_mul_gf2pn #(
     .NPower      ( 2    ),
     .Pipeline    ( 1'b1 ),
     .PreDomIndep ( 1'b0 )
   ) u_aes_dom_mul_gamma1_gamma0 (
     .clk_i  ( clk_i           ),
     .rst_ni ( rst_ni          ),
     .we_i   ( we_i[0]         ),
     .a_x    ( a_gamma1        ), // Share a of x
     .a_y    ( a_gamma0        ), // Share a of y
     .b_x    ( b_gamma1        ), // Share b of x
     .b_y    ( b_gamma0        ), // Share b of y
     .z_0    ( prd_2[1:0]      ), // Randomness for blinding
     .z_1    ( prd_2[3:2]      ), // Randomness for resharing
     .a_q    ( a_gamma1_gamma0 ), // Share a of q
     .b_q    ( b_gamma1_gamma0 )  // Share b of q
   );

   /////////////
   // Stage 3 //
   /////////////

   // Formulas 14 and 15 in [2].
   // omega is the square of the sum of the multiplier output and the registered square-scale
   // term, computed per share.
   logic [1:0] a_omega, b_omega;
   assign a_omega = aes_square_gf2p2(a_gamma1_gamma0 ^ a_gamma_ss_q);
   assign b_omega = aes_square_gf2p2(b_gamma1_gamma0 ^ b_gamma_ss_q);

   // Avoid aggressive synthesis optimizations.
   logic [1:0] a_omega_buf, b_omega_buf;
   prim_buf #(
     .Width ( 4 )
   ) u_prim_buf_ab_omega (
     .in_i  ( {a_omega,     b_omega}     ),
     .out_o ( {a_omega_buf, b_omega_buf} )
   );

   // Formulas 16 and 17 in [2].

   // Register the input halves with we_i[0]. The Stage 3 multipliers use these registered
   // versions instead of the module inputs.
   logic [1:0] a_gamma1_q, a_gamma0_q, b_gamma1_q, b_gamma0_q;
   prim_flop_en #(
     .Width      ( 8  ),
     .ResetValue ( '0 )
   ) u_prim_flop_ab_gamma10 (
     .clk_i  ( clk_i                                            ),
     .rst_ni ( rst_ni                                           ),
     .en_i   ( we_i[0]                                          ),
     .d_i    ( {a_gamma1,   a_gamma0,   b_gamma1,   b_gamma0}   ),
     .q_o    ( {a_gamma1_q, a_gamma0_q, b_gamma1_q, b_gamma0_q} )
   );

   // Lower half of the output: gamma1 * omega.
   aes_dom_dep_mul_gf2pn #(
     .NPower      ( 2    ),
     .Pipeline    ( 1'b1 ),
     .PreDomIndep ( 1'b0 )
   ) u_aes_dom_mul_omega_gamma1 (
     .clk_i  ( clk_i            ),
     .rst_ni ( rst_ni           ),
     .we_i   ( we_i[1]          ),
     .a_x    ( a_gamma1_q       ), // Share a of x
     .a_y    ( a_omega_buf      ), // Share a of y
     .b_x    ( b_gamma1_q       ), // Share b of x
     .b_y    ( b_omega_buf      ), // Share b of y
     .z_0    ( prd_3[5:4]       ), // Randomness for blinding
     .z_1    ( prd_3[7:6]       ), // Randomness for resharing
     .a_q    ( a_gamma_inv[1:0] ), // Share a of q
     .b_q    ( b_gamma_inv[1:0] )  // Share b of q
   );

   // Upper half of the output: omega * gamma0.
   aes_dom_dep_mul_gf2pn #(
     .NPower      ( 2    ),
     .Pipeline    ( 1'b1 ),
     .PreDomIndep ( 1'b0 )
   ) u_aes_dom_mul_omega_gamma0 (
     .clk_i  ( clk_i            ),
     .rst_ni ( rst_ni           ),
     .we_i   ( we_i[1]          ),
     .a_x    ( a_omega_buf      ), // Share a of x
     .a_y    ( a_gamma0_q       ), // Share a of y
     .b_x    ( b_omega_buf      ), // Share b of x
     .b_y    ( b_gamma0_q       ), // Share b of y
     .z_0    ( prd_3[1:0]       ), // Randomness for blinding
     .z_1    ( prd_3[3:2]       ), // Randomness for resharing
     .a_q    ( a_gamma_inv[3:2] ), // Share a of q
     .b_q    ( b_gamma_inv[3:2] )  // Share b of q
   );

 endmodule

 // Inverse in GF(2^8) using first-order domain-oriented masking and normal basis [y^16, y].
 // See Fig. 6 in [1] and Formulas 3, 12, 18 and 19 in [2].
 //
 // Ports:
 // - we_i: Register enables. we_i[0] enables the Stage 1 registers, we_i[2:1] are forwarded to
 //         the GF(2^4) inverse (Stages 2 and 3), we_i[2] in addition enables the registers
 //         holding the input halves for Stage 4, and we_i[3] enables the registers of the Stage 4
 //         multipliers.
 // - prd:  Pseudo-random data, one struct member per stage.
 module aes_dom_inverse_gf2p8 (
   input  logic        clk_i,
   input  logic        rst_ni,
   input  logic  [3:0] we_i,
   input  logic  [7:0] a_y,     // input data masked by b_y
   input  logic  [7:0] b_y,     // input mask
   input  prd_t        prd,     // pseudo-random data, e.g. for intermediate masks
   output logic  [7:0] a_y_inv, // output data masked by b_y_inv
   output logic  [7:0] b_y_inv  // output mask
 );

   import aes_sbox_canright_pkg::*;

   /////////////
   // Stage 1 //
   /////////////
   // Formula 12 in [2].

   // Split both input shares into upper (y1) and lower (y0) halves.
   logic [3:0] a_y1, a_y0, b_y1, b_y0, a_y1_y0, b_y1_y0;
   assign a_y1 = a_y[7:4];
   assign a_y0 = a_y[3:0];
   assign b_y1 = b_y[7:4];
   assign b_y0 = b_y[3:0];

   // Square and scale the sum of the two halves. This is computed per share and registered with
   // we_i[0], i.e., together with the registers inside the Stage 1 multiplier.
   logic [3:0] a_y_ss_d, b_y_ss_d;
   logic [3:0] a_y_ss_q, b_y_ss_q;
   assign a_y_ss_d = aes_square_scale_gf2p4_gf2p2(a_y1 ^ a_y0);
   assign b_y_ss_d = aes_square_scale_gf2p4_gf2p2(b_y1 ^ b_y0);
   prim_flop_en #(
     .Width      ( 8  ),
     .ResetValue ( '0 )
   ) u_prim_flop_ab_y_ss (
     .clk_i  ( clk_i                ),
     .rst_ni ( rst_ni               ),
     .en_i   ( we_i[0]              ),
     .d_i    ( {a_y_ss_d, b_y_ss_d} ),
     .q_o    ( {a_y_ss_q, b_y_ss_q} )
   );

   // Masked multiplication y1 * y0. Both operands are halves of the same sharing, so the DOM-dep
   // multiplier is used.
   aes_dom_dep_mul_gf2pn #(
     .NPower      ( 4    ),
     .Pipeline    ( 1'b1 ),
     .PreDomIndep ( 1'b0 )
   ) u_aes_dom_mul_y1_y0 (
     .clk_i  ( clk_i          ),
     .rst_ni ( rst_ni         ),
     .we_i   ( we_i[0]        ),
     .a_x    ( a_y1           ), // Share a of x
     .a_y    ( a_y0           ), // Share a of y
     .b_x    ( b_y1           ), // Share b of x
     .b_y    ( b_y0           ), // Share b of y
     .z_0    ( prd.prd_1[3:0] ), // Randomness for blinding
     .z_1    ( prd.prd_1[7:4] ), // Randomness for resharing
     .a_q    ( a_y1_y0        ), // Share a of q
     .b_q    ( b_y1_y0        )  // Share b of q
   );

   // gamma is the sum of the registered square-scale term and the multiplier output, per share.
   logic [3:0] a_gamma, b_gamma;
   assign a_gamma = a_y_ss_q ^ a_y1_y0;
   assign b_gamma = b_y_ss_q ^ b_y1_y0;

   // Avoid aggressive synthesis optimizations.
   logic [3:0] a_gamma_buf, b_gamma_buf;
   prim_buf #(
     .Width ( 8 )
   ) u_prim_buf_ab_gamma (
     .in_i  ( {a_gamma,     b_gamma}     ),
     .out_o ( {a_gamma_buf, b_gamma_buf} )
   );

   ////////////////////
   // Stages 2 and 3 //
   ////////////////////

   // theta is the masked GF(2^4) inverse of gamma.
   logic [3:0] a_theta, b_theta;

   // a_gamma is masked by b_gamma, a_gamma_inv is masked by b_gamma_inv.
   aes_dom_inverse_gf2p4 u_aes_dom_inverse_gf2p4 (
     .clk_i       ( clk_i       ),
     .rst_ni      ( rst_ni      ),
     .we_i        ( we_i[2:1]   ),
     .a_gamma     ( a_gamma_buf ),
     .b_gamma     ( b_gamma_buf ),
     .prd_2       ( prd.prd_2   ),
     .prd_3       ( prd.prd_3   ),
     .a_gamma_inv ( a_theta     ),
     .b_gamma_inv ( b_theta     )
   );

   /////////////
   // Stage 4 //
   /////////////
   // Formulas 18 and 19 in [2].

   // Register the input halves with we_i[2]. The Stage 4 multipliers use these registered
   // versions instead of the module inputs.
   logic [3:0] a_y1_q, a_y0_q, b_y1_q, b_y0_q;
   prim_flop_en #(
     .Width      ( 16 ),
     .ResetValue ( '0 )
   ) u_prim_flop_ab_y10 (
     .clk_i  ( clk_i                            ),
     .rst_ni ( rst_ni                           ),
     .en_i   ( we_i[2]                          ),
     .d_i    ( {a_y1,   a_y0,   b_y1,   b_y0}   ),
     .q_o    ( {a_y1_q, a_y0_q, b_y1_q, b_y0_q} )
   );

   // Lower half of the output: y1 * theta. This uses the DOM-indep multiplier.
   aes_dom_indep_mul_gf2pn #(
     .NPower   ( 4    ),
     .Pipeline ( 1'b1 )
   ) u_aes_dom_mul_theta_y1 (
     .clk_i  ( clk_i          ),
     .rst_ni ( rst_ni         ),
     .we_i   ( we_i[3]        ),
     .a_x    ( a_y1_q         ), // Share a of x
     .a_y    ( a_theta        ), // Share a of y
     .b_x    ( b_y1_q         ), // Share b of x
     .b_y    ( b_theta        ), // Share b of y
     .z_0    ( prd.prd_4[7:4] ), // Randomness for resharing
     .a_q    ( a_y_inv[3:0]   ), // Share a of q
     .b_q    ( b_y_inv[3:0]   )  // Share b of q
   );

   // Upper half of the output: theta * y0. This uses the DOM-indep multiplier.
   aes_dom_indep_mul_gf2pn #(
     .NPower   ( 4    ),
     .Pipeline ( 1'b1 )
   ) u_aes_dom_mul_theta_y0 (
     .clk_i  ( clk_i          ),
     .rst_ni ( rst_ni         ),
     .we_i   ( we_i[3]        ),
     .a_x    ( a_theta        ), // Share a of x
     .a_y    ( a_y0_q         ), // Share a of y
     .b_x    ( b_theta        ), // Share b of x
     .b_y    ( b_y0_q         ), // Share b of y
     .z_0    ( prd.prd_4[3:0] ), // Randomness for resharing
     .a_q    ( a_y_inv[7:4]   ), // Share a of q
     .b_q    ( b_y_inv[7:4]   )  // Share b of q
   );

 endmodule

 // AES S-Box with first-order domain-oriented masking, top level.
 //
 // Both shares (data_i, mask_i) are converted to normal basis X, inverted in GF(2^8) by
 // aes_dom_inverse_gf2p8 and converted back. op_i selects which pair of basis conversions is used
 // (CIPH_FWD vs. everything else). A counter sequences the four internal register stages via
 // we[3:0] and raises out_req_o once it has reached 4. The counter is cleared when out_req_o and
 // out_ack_i are both high. prd_i is forwarded to the stage enabled in the current cycle and
 // buffered for later cycles.
 module aes_sbox_dom (
   input  logic              clk_i,
   input  logic              rst_ni,
   input  logic              en_i,
   output logic              out_req_o,
   input  logic              out_ack_i,
   input  aes_pkg::ciph_op_e op_i,
   input  logic        [7:0] data_i, // masked, the actual input data is data_i ^ mask_i
   input  logic        [7:0] mask_i, // input mask
   input  logic        [7:0] prd_i,  // pseudo-random data for remasking, in total we need 28 bits
                                     // of PRD per evaluation, but at most 8 bits per cycle
   output logic        [7:0] data_o, // masked, the actual output data is data_o ^ mask_o
   output logic        [7:0] mask_o  // output mask
 );

   import aes_pkg::*;
   import aes_sbox_canright_pkg::*;

   // Per-stage register write enables and the PRD seen by / buffered for the stages.
   logic [3:0] we;
   prd_t       prd_d, prd_q;

   // Shares in normal basis X before and after the inversion.
   logic [7:0] in_data_basis_x, in_mask_basis_x;
   logic [7:0] out_data_basis_x, out_mask_basis_x;

   // Selects the forward basis conversions.
   logic op_is_fwd;
   assign op_is_fwd = (op_i == CIPH_FWD);

   ///////////////////////////
   // Input basis conversion //
   ///////////////////////////
   // Data share: the constant 8'h63 is added prior to the S2X conversion in the non-forward case.
   assign in_data_basis_x = op_is_fwd ? aes_mvm(data_i, A2X) : aes_mvm(data_i ^ 8'h63, S2X);

   // Mask share: same matrices as for the data share, but the constant 8'h63 is never added.
   assign in_mask_basis_x = op_is_fwd ? aes_mvm(mask_i, A2X) : aes_mvm(mask_i, S2X);

   ///////////////
   // Inversion //
   ///////////////
   // Masked inversion in normal basis X. The stages get the un-registered prd_d.
   aes_dom_inverse_gf2p8 u_aes_dom_inverse_gf2p8 (
     .clk_i   ( clk_i            ),
     .rst_ni  ( rst_ni           ),
     .we_i    ( we               ),
     .a_y     ( in_data_basis_x  ), // input
     .b_y     ( in_mask_basis_x  ), // input
     .prd     ( prd_d            ), // input
     .a_y_inv ( out_data_basis_x ), // output
     .b_y_inv ( out_mask_basis_x )  // output
   );

   ////////////////////////////
   // Output basis conversion //
   ////////////////////////////
   // Data share: the constant 8'h63 is added following the X2S conversion in the forward case.
   assign data_o = op_is_fwd ? (aes_mvm(out_data_basis_x, X2S) ^ 8'h63) :
                               aes_mvm(out_data_basis_x, X2A);

   // Mask share: same matrices as for the data share, but the constant 8'h63 is never added.
   assign mask_o = op_is_fwd ? aes_mvm(out_mask_basis_x, X2S) : aes_mvm(out_mask_basis_x, X2A);

   /////////////
   // Control //
   /////////////
   logic [2:0] count_d, count_q;
   logic       out_handshake;

   // The result is signaled as valid once the counter has reached 4 and while en_i is high.
   assign out_req_o     = en_i & (count_q == 3'd4);
   assign out_handshake = out_req_o & out_ack_i;

   // Clear the counter upon a completed handshake. Hold it while the result is pending or while
   // the S-Box is not enabled. Otherwise, advance to the next stage.
   assign count_d = out_handshake       ? 3'd0    :
                    (out_req_o | ~en_i) ? count_q : count_q + 3'd1;

   always_ff @(posedge clk_i or negedge rst_ni) begin : reg_count
     if (!rst_ni) begin
       count_q <= '0;
     end else begin
       count_q <= count_d;
     end
   end

   // Write enable signals for internal registers: we[i] is high while en_i is high and the
   // counter equals i.
   for (genvar i = 0; i < 4; i++) begin : gen_we
     assign we[i] = en_i & (count_q == 3'(i));
   end

   //////////////////
   // PRD handling //
   //////////////////
   // Buffer and forward PRD for the individual stages. We get 8 bits per cycle from the PRNG.
   // Stage 1, 3 and 4 require 8 bits each. Stage 2 requires just 4 bits.
   always_comb begin : iv_mux
     // By default, every stage keeps seeing its buffered PRD.
     prd_d = prd_q;

     // The stage enabled in this cycle directly gets the fresh PRD.
     unique case (we)
       4'b0001: prd_d.prd_1 = prd_i;
       4'b0010: prd_d.prd_2 = prd_i[3:0];
       4'b0100: prd_d.prd_3 = prd_i;
       4'b1000: prd_d.prd_4 = prd_i;
       default: ;
     endcase
   end

   // Only update the PRD buffer in cycles in which a stage is enabled.
   logic prd_we;
   assign prd_we = |we;

   always_ff @(posedge clk_i or negedge rst_ni) begin : reg_prd
     if (!rst_ni) begin
       prd_q <= '0;
     end else if (prd_we) begin
       prd_q <= prd_d;
     end
   end

 endmodule