// Copyright lowRISC contributors.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
//
// AES Masked Canright SBox with Mask Re-Use
//
// For details, see the following paper:
// Canright, "A very compact 'perfectly masked' S-box for AES (corrected)"
// available at https://eprint.iacr.org/2009/011.pdf
//
// Note: This module implements the masked inversion algorithm with mask re-use. For details,
// see Section 2.3 of the paper. Re-using masks may make the implementation more vulnerable to
// higher-order differential side-channel analysis, but it remains secure against first-order
// attacks. This implementation is commonly referred to as THE Canright Masked SBox.
//
// A formal analysis using REBECCA (stable mode) shows that this implementation is not secure.
// It is thus recommended to use the "noreuse" variant of the masked Canright S-Box.
//
// For details on the REBECCA tool, see the following paper:
// Bloem, "Formal verification of masked hardware implementations in the presence of glitches"
// available at https://eprint.iacr.org/2017/897.pdf

///////////////////////////////////////////////////////////////////////////////////////////////////
// IMPORTANT NOTE:                                                                               //
//                             DO NOT USE THIS FOR SYNTHESIS BLINDLY!                            //
//                                                                                               //
// This implementation targets primarily Xilinx Vivado synthesis as well as RTL simulation. It  //
// contains synthesis attributes specific to Xilinx Vivado to enforce the correct ordering of   //
// operations and avoid aggressive optimization. Other synthesis tools might still heavily      //
// optimize the design. The result is likely insecure. Use with care.                           //
///////////////////////////////////////////////////////////////////////////////////////////////////
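
// Throughout this file, the (* keep = "true" *) synthesis attribute instructs Xilinx Vivado to
// preserve the annotated intermediate nets, so that the share-wise operations below are not
// merged into combined expressions that could momentarily unmask intermediate values.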

// Masked inverse in GF(2^4), using normal basis [z^4, z]
// (see Formulas 6, 13, 14, 15, 21, 22, 23, 24 in the paper)
module aes_masked_inverse_gf2p4 (
  input  logic [3:0] b,
  input  logic [3:0] q,
  input  logic [1:0] r,
  input  logic [3:0] m1,
  output logic [3:0] b_inv
);

  import aes_pkg::*;
  import aes_sbox_canright_pkg::*;

  logic [1:0] b1, b0, q1, q0, c, r_sq, m11, m10, b1_inv, b0_inv;
  assign b1  = b[3:2];
  assign b0  = b[1:0];
  assign q1  = q[3:2];
  assign q0  = q[1:0];
  assign m11 = m1[3:2];
  assign m10 = m1[1:0];

  // Get re-usable intermediate results.
  (* keep = "true" *) logic [1:0] mul_b0_q1, mul_b1_q0, mul_q0_q1;
  assign mul_b0_q1 = aes_mul_gf2p2(b0, q1);
  assign mul_b1_q0 = aes_mul_gf2p2(b1, q0);
  assign mul_q0_q1 = aes_mul_gf2p2(q0, q1);
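
  // These shared products make it possible to compute a1*a0 on shares without unmasking: writing
  // b1 = a1_t ^ q1 and b0 = a0_t ^ q0, with a1_t/a0_t denoting the true values, multiplication
  // in GF(2^2) distributes over XOR, so
  //   b1*b0 ^ b1*q0 ^ b0*q1 ^ q0*q1 = a1_t*a0_t,
  // with all cross terms involving the masks canceling pairwise. Squaring and scaling are
  // likewise linear over XOR, which is what Formula 13 below relies on.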

  // Formula 13
  // IMPORTANT: The following ops must be executed in order (left to right):
  // c = r ^ aes_scale_omega2_gf2p2(aes_square_gf2p2(b1 ^ b0))
  //       ^ aes_scale_omega2_gf2p2(aes_square_gf2p2(q1 ^ q0))
  //       ^ aes_mul_gf2p2(b1, b0)
  //       ^ mul_b1_q0 ^ mul_b0_q1 ^ mul_q0_q1;
  //
  // Get intermediate terms. The terms they are added to depend on the same inputs, so they are
  // kept in separate nets to avoid aggressive synthesis optimizations.
  (* keep = "true" *) logic [1:0] scale_omega2_b, scale_omega2_q;
  (* keep = "true" *) logic [1:0] mul_b1_b0;
  assign scale_omega2_b = aes_scale_omega2_gf2p2(aes_square_gf2p2(b1 ^ b0));
  assign scale_omega2_q = aes_scale_omega2_gf2p2(aes_square_gf2p2(q1 ^ q0));
  assign mul_b1_b0      = aes_mul_gf2p2(b1, b0);
  // Generate c step by step.
  (* keep = "true" *) logic [1:0] c_0, c_1, c_2, c_3, c_4;
  assign c_0 = r   ^ scale_omega2_b;
  assign c_1 = c_0 ^ scale_omega2_q;
  assign c_2 = c_1 ^ mul_b1_b0;
  assign c_3 = c_2 ^ mul_b1_q0;
  assign c_4 = c_3 ^ mul_b0_q1;
  assign c   = c_4 ^ mul_q0_q1;

  // Below, c1_inv and c2_inv (derived from the output of the inverse over GF(2^2)) are again
  // recombined with signals that also feed c and thus the GF(2^2) inverter. Aggressive synthesis
  // optimizations across the GF(2^2) inverter may result in SCA leakage and should be avoided.
  (* keep = "true" *) logic [1:0] c_inv, c1_inv, c2_inv;

  // Formulas 14 and 15
  assign c_inv = aes_square_gf2p2(c);
  assign r_sq  = aes_square_gf2p2(r);
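
  // In GF(2^2), inversion coincides with squaring: every nonzero element x satisfies x^3 = 1,
  // hence x^-1 = x^2 (and 0 maps to 0). Since squaring is linear over XOR, this gives
  // c_inv = (c_t ^ r)^2 = c_t^-1 ^ r_sq, i.e., the true inverse masked by r_sq.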

  // Re-masking c_inv
  // Formulas 21 and 23
  // IMPORTANT: First combine the masks (ops in parens) then apply to c_inv:
  // c1_inv = c_inv  ^ (q1 ^ r_sq);
  // c2_inv = c1_inv ^ (q0 ^ q1);
  //
  // Generate c1_inv and c2_inv step by step.
  (* keep = "true" *) logic [1:0] xor_q1_r_sq, xor_q0_q1;
  assign xor_q1_r_sq = q1 ^ r_sq;
  assign xor_q0_q1   = q0 ^ q1;
  assign c1_inv      = c_inv  ^ xor_q1_r_sq;
  assign c2_inv      = c1_inv ^ xor_q0_q1;
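
  // Tracking the masks: c_inv carries mask r_sq (see above), so
  //   c1_inv = (c_t^-1 ^ r_sq) ^ (q1 ^ r_sq) = c_t^-1 ^ q1,
  //   c2_inv = (c_t^-1 ^ q1)   ^ (q0 ^ q1)   = c_t^-1 ^ q0.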

  // Formulas 22 and 24
  // IMPORTANT: The following ops must be executed in order (left to right):
  // b1_inv = m11 ^ aes_mul_gf2p2(b0, c1_inv)
  //              ^ mul_b0_q1 ^ aes_mul_gf2p2(q0, c1_inv) ^ mul_q0_q1;
  // b0_inv = m10 ^ aes_mul_gf2p2(b1, c2_inv)
  //              ^ mul_b1_q0 ^ aes_mul_gf2p2(q1, c2_inv) ^ mul_q0_q1;
  //
  // Get intermediate terms. The terms they are added to depend on the same inputs, so they are
  // kept in separate nets to avoid aggressive synthesis optimizations.
  (* keep = "true" *) logic [1:0] mul_b0_c1_inv, mul_q0_c1_inv;
  (* keep = "true" *) logic [1:0] mul_b1_c2_inv, mul_q1_c2_inv;
  assign mul_b0_c1_inv = aes_mul_gf2p2(b0, c1_inv);
  assign mul_q0_c1_inv = aes_mul_gf2p2(q0, c1_inv);
  assign mul_b1_c2_inv = aes_mul_gf2p2(b1, c2_inv);
  assign mul_q1_c2_inv = aes_mul_gf2p2(q1, c2_inv);
  // Generate b1_inv and b0_inv step by step.
  (* keep = "true" *) logic [1:0] b1_inv_0, b1_inv_1, b1_inv_2;
  (* keep = "true" *) logic [1:0] b0_inv_0, b0_inv_1, b0_inv_2;
  assign b1_inv_0 = m11      ^ mul_b0_c1_inv;
  assign b1_inv_1 = b1_inv_0 ^ mul_b0_q1;
  assign b1_inv_2 = b1_inv_1 ^ mul_q0_c1_inv;
  assign b1_inv   = b1_inv_2 ^ mul_q0_q1;
  assign b0_inv_0 = m10      ^ mul_b1_c2_inv;
  assign b0_inv_1 = b0_inv_0 ^ mul_b1_q0;
  assign b0_inv_2 = b0_inv_1 ^ mul_q1_c2_inv;
  assign b0_inv   = b0_inv_2 ^ mul_q0_q1;
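
  // As for Formula 13, expanding the shares shows that all mask cross terms cancel pairwise,
  // leaving the true partial products of the inverse masked by m11 and m10, respectively.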

  // Note: b_inv is masked by m1, b was masked by q.
  assign b_inv = {b1_inv, b0_inv};
endmodule

// Masked inverse in GF(2^8), using normal basis [y^16, y]
// (see Formulas 3, 12, 25, 26 and 27 in the paper)
module aes_masked_inverse_gf2p8 (
  input  logic [7:0] a,
  input  logic [7:0] m,
  input  logic [7:0] n,
  output logic [7:0] a_inv
);

  import aes_pkg::*;
  import aes_sbox_canright_pkg::*;

  logic [3:0] a1, a0, m1, m0, q, s1, s0, a1_inv, a0_inv;
  logic [1:0] r;

  // The output of the inverse over GF(2^4) and signals derived from that are again recombined
  // with inputs to the GF(2^4) inverter. Aggressive synthesis optimizations across the GF(2^4)
  // inverter may result in SCA leakage and should be avoided.
  (* keep = "true" *) logic [3:0] b, b_inv, b2_inv;

  assign a1 = a[7:4];
  assign a0 = a[3:0];
  assign m1 = m[7:4];
  assign m0 = m[3:0];

  // Get re-usable intermediate results.
  (* keep = "true" *) logic [3:0] mul_a0_m1, mul_a1_m0, mul_m0_m1;
  assign mul_a0_m1 = aes_mul_gf2p4(a0, m1);
  assign mul_a1_m0 = aes_mul_gf2p4(a1, m0);
  assign mul_m0_m1 = aes_mul_gf2p4(m0, m1);

  // q must be independent of m.
  assign q = n[7:4];
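
  // At the top level, n is derived from prd_i, which the interface requires to be independent
  // of the input mask, so this holds by construction.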

  // Formula 12
  // IMPORTANT: The following ops must be executed in order (left to right):
  // b = q ^ aes_square_scale_gf2p4_gf2p2(a1 ^ a0)
  //       ^ aes_square_scale_gf2p4_gf2p2(m1 ^ m0)
  //       ^ aes_mul_gf2p4(a1, a0)
  //       ^ mul_a1_m0 ^ mul_a0_m1 ^ mul_m0_m1;
  //
  // Get intermediate terms. The terms they are added to depend on the same inputs, so they are
  // kept in separate nets to avoid aggressive synthesis optimizations.
  (* keep = "true" *) logic [3:0] mul_a1_a0;
  assign mul_a1_a0 = aes_mul_gf2p4(a1, a0);
  // Generate b step by step.
  (* keep = "true" *) logic [3:0] b_0, b_1, b_2, b_3, b_4;
  assign b_0 = q   ^ aes_square_scale_gf2p4_gf2p2(a1 ^ a0);  // q does not depend on a1, a0.
  assign b_1 = b_0 ^ aes_square_scale_gf2p4_gf2p2(m1 ^ m0);  // b_0 does not depend on m1, m0.
  assign b_2 = b_1 ^ mul_a1_a0;
  assign b_3 = b_2 ^ mul_a1_m0;
  assign b_4 = b_3 ^ mul_a0_m1;
  assign b   = b_4 ^ mul_m0_m1;
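
  // The same cancellation as in Formula 13 above applies here: all mask cross terms cancel
  // pairwise, so b equals the true value of Formula 12 masked by q.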

  // r must be independent of q.
  assign r = m1[3:2];
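
  // r is drawn from the input mask m, whereas q is drawn from n (fresh pseudo-random data), so
  // the two are independent by the interface requirement on prd_i.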

  // b is masked by q, b_inv is masked by m1.
  aes_masked_inverse_gf2p4 u_aes_masked_inverse_gf2p4 (
    .b     ( b     ),
    .q     ( q     ),
    .r     ( r     ),
    .m1    ( m1    ),
    .b_inv ( b_inv )
  );

  // Formula 26
  // IMPORTANT: First combine the masks (ops in parens) then apply to b_inv:
  // b2_inv = b_inv ^ (m1 ^ m0);
  //
  // Generate b2_inv step by step.
  (* keep = "true" *) logic [3:0] xor_m1_m0;
  assign xor_m1_m0 = m1 ^ m0;
  assign b2_inv    = b_inv ^ xor_m1_m0;
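
  // b_inv carries mask m1 (see above), so b2_inv = (b_inv_t ^ m1) ^ (m1 ^ m0) = b_inv_t ^ m0,
  // i.e., the true value re-masked by m0.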

  // s is the specified output mask n.
  assign s1 = n[7:4];
  assign s0 = n[3:0];

  // Formulas 25 and 27
  // IMPORTANT: The following ops must be executed in order (left to right):
  // a1_inv = s1 ^ aes_mul_gf2p4(a0, b_inv)
  //             ^ mul_a0_m1 ^ aes_mul_gf2p4(m0, b_inv) ^ mul_m0_m1;
  // a0_inv = s0 ^ aes_mul_gf2p4(a1, b2_inv)
  //             ^ mul_a1_m0 ^ aes_mul_gf2p4(m1, b2_inv) ^ mul_m0_m1;
  //
  // Get intermediate terms. The terms they are added to depend on the same inputs, so they are
  // kept in separate nets to avoid aggressive synthesis optimizations.
  (* keep = "true" *) logic [3:0] mul_a0_b_inv, mul_m0_b_inv;
  (* keep = "true" *) logic [3:0] mul_a1_b2_inv, mul_m1_b2_inv;
  assign mul_a0_b_inv  = aes_mul_gf2p4(a0, b_inv);
  assign mul_m0_b_inv  = aes_mul_gf2p4(m0, b_inv);
  assign mul_a1_b2_inv = aes_mul_gf2p4(a1, b2_inv);
  assign mul_m1_b2_inv = aes_mul_gf2p4(m1, b2_inv);
  // Generate a1_inv and a0_inv step by step.
  (* keep = "true" *) logic [3:0] a1_inv_0, a1_inv_1, a1_inv_2;
  (* keep = "true" *) logic [3:0] a0_inv_0, a0_inv_1, a0_inv_2;
  assign a1_inv_0 = s1       ^ mul_a0_b_inv;
  assign a1_inv_1 = a1_inv_0 ^ mul_a0_m1;
  assign a1_inv_2 = a1_inv_1 ^ mul_m0_b_inv;
  assign a1_inv   = a1_inv_2 ^ mul_m0_m1;
  assign a0_inv_0 = s0       ^ mul_a1_b2_inv;  // Actually, s0 does not depend on a1, b2_inv.
  assign a0_inv_1 = a0_inv_0 ^ mul_a1_m0;
  assign a0_inv_2 = a0_inv_1 ^ mul_m1_b2_inv;
  assign a0_inv   = a0_inv_2 ^ mul_m0_m1;

  // Note: a_inv is now masked by s = n, a was masked by m.
  assign a_inv = {a1_inv, a0_inv};
endmodule

module aes_sbox_canright_masked (
  input  aes_pkg::ciph_op_e op_i,
  input  logic [7:0]        data_i, // masked, the actual input data is data_i ^ mask_i
  input  logic [7:0]        mask_i, // input mask, independent of the actual input data
  input  logic [7:0]        prd_i,  // pseudo-random data for remasking, independent of input mask
  output logic [7:0]        data_o, // masked, the actual output data is data_o ^ mask_o
  output logic [7:0]        mask_o  // output mask
);

  import aes_pkg::*;
  import aes_sbox_canright_pkg::*;

  //////////////////////////
  // Masked Canright SBox //
  //////////////////////////

  logic [7:0] in_data_basis_x, out_data_basis_x;
  logic [7:0] in_mask_basis_x, out_mask_basis_x;

  // Convert data to normal basis X.
  assign in_data_basis_x = (op_i == CIPH_FWD) ? aes_mvm(data_i, A2X) :
                                                aes_mvm(data_i ^ 8'h63, S2X);

  // For the masked Canright SBox, the output mask directly corresponds to the pseudo-random
  // data provided as input.
  assign mask_o = prd_i;

  // Convert masks to normal basis X.
  // The addition of constant 8'h63 following the affine transformation is skipped.
  assign in_mask_basis_x = (op_i == CIPH_FWD) ? aes_mvm(mask_i, A2X) :
                                                aes_mvm(mask_i, S2X);
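
  // The constant 8'h63 is public and must enter exactly once across the two shares: in the
  // CIPH_INV case it is already added to the data share above, and since aes_mvm is linear,
  // adding it to the mask share as well would cancel it from the recombined value.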

  // The output mask is converted in the opposite direction, i.e., from the basis of the S-Box
  // output back to normal basis X.
  assign out_mask_basis_x = (op_i == CIPH_INV) ? aes_mvm(mask_o, A2X) :
                                                 aes_mvm(mask_o, S2X);

  // Do the inversion in normal basis X.
  aes_masked_inverse_gf2p8 u_aes_masked_inverse_gf2p8 (
    .a     ( in_data_basis_x  ), // input
    .m     ( in_mask_basis_x  ), // input
    .n     ( out_mask_basis_x ), // input
    .a_inv ( out_data_basis_x )  // output
  );

  // Convert to basis S or A.
  assign data_o = (op_i == CIPH_FWD) ? (aes_mvm(out_data_basis_x, X2S) ^ 8'h63) :
                                       (aes_mvm(out_data_basis_x, X2A));

endmodule
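
// The following module is a minimal simulation-only sketch (not intended for synthesis) of how
// the masked S-Box can be checked for functional correctness: unmask the output and compare it
// against an unmasked reference. It assumes the aes_sbox_lut module from this repository with
// ports op_i, data_i and data_o; adjust the reference instance if that interface differs.
module aes_sbox_canright_masked_check;
  import aes_pkg::*;

  logic [7:0] data, mask, prd;
  logic [7:0] data_o, mask_o, ref_o;

  // Device under test, operating on the masked share data ^ mask.
  aes_sbox_canright_masked u_dut (
    .op_i   ( CIPH_FWD    ),
    .data_i ( data ^ mask ),
    .mask_i ( mask        ),
    .prd_i  ( prd         ),
    .data_o ( data_o      ),
    .mask_o ( mask_o      )
  );

  // Unmasked reference S-Box.
  aes_sbox_lut u_ref (
    .op_i   ( CIPH_FWD ),
    .data_i ( data     ),
    .data_o ( ref_o    )
  );

  initial begin
    repeat (1000) begin
      data = 8'($urandom);
      mask = 8'($urandom);
      prd  = 8'($urandom);
      #1;  // Let the combinational logic settle.
      if ((data_o ^ mask_o) !== ref_o) begin
        $error("Mismatch for data 0x%0h, mask 0x%0h", data, mask);
      end
    end
  end
endmodule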