[aes] Rework masked Canright S-Box implementations for FPGA synthesis

This commit reworks the masked S-Box implementations and adds the synthesis
attributes required for Xilinx Vivado. Other synthesis tools may need to be
given equivalent directives for these attributes to take effect.

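The pattern used throughout is to break each security-critical XOR chain into
explicitly named nets and to mark the intermediates with the Vivado keep
attribute, so that synthesis cannot re-associate or merge the terms. A minimal
sketch of the idiom (signal names are illustrative only, not from the patch):

  // Enforce left-to-right evaluation of x = a ^ b ^ c: name each stage
  // and keep the intermediate net through synthesis.
  (* keep = "true" *) logic [1:0] x_0;
  assign x_0 = a   ^ b;
  assign x   = x_0 ^ c;
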
Signed-off-by: Pirmin Vogel <vogelpi@lowrisc.org>
diff --git a/hw/ip/aes/lint/aes.vlt b/hw/ip/aes/lint/aes.vlt
index 7c72b09..004c2ba 100644
--- a/hw/ip/aes/lint/aes.vlt
+++ b/hw/ip/aes/lint/aes.vlt
@@ -9,3 +9,6 @@
 // Always_comb variable driven after use: 'regular'
 // regular is assigned in a for loop, regular[1] depends on regular[0]
 lint_off -rule ALWCOMBORDER -file "*/rtl/aes_key_expand.sv" -match "*'regular'"
+
+// Masked SBox implementations may require multiple modules per file to prevent aggressive synthesis optimizations.
+lint_off -rule DECLFILENAME -file "*/rtl/aes_sbox_*_masked*.sv" -match "Filename 'aes_sbox_*_masked*' does not match MODULE name: *"
diff --git a/hw/ip/aes/rtl/aes_sbox_canright_masked.sv b/hw/ip/aes/rtl/aes_sbox_canright_masked.sv
index 5c9ca27..1d0502c 100644
--- a/hw/ip/aes/rtl/aes_sbox_canright_masked.sv
+++ b/hw/ip/aes/rtl/aes_sbox_canright_masked.sv
@@ -17,10 +17,225 @@
 // IMPORTANT NOTE:                                                                               //
 //                            DO NOT USE THIS FOR SYNTHESIS BLINDLY!                             //
 //                                                                                               //
-// This is a high-level implementation targeting primarily RTL simulation. Synthesis tools might //
-// heavily optimize the design. The result is likely insecure. Use with care.                    //
+// This implementation primarily targets Xilinx Vivado synthesis as well as RTL simulation. It   //
+// contains synthesis attributes specific to Xilinx Vivado to enforce the correct ordering of    //
+// operations and avoid aggressive optimization. Other synthesis tools might still heavily       //
+// optimize the design. The result is likely insecure. Use with care.                            //
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
+// Masked inverse in GF(2^4), using normal basis [z^4, z]
+// (see Formulas 6, 13, 14, 15, 21, 22, 23, 24 in the paper)
+module aes_masked_inverse_gf2p4 (
+  input  logic [3:0] b,
+  input  logic [3:0] q,
+  input  logic [1:0] r,
+  input  logic [3:0] m1,
+  output logic [3:0] b_inv
+);
+
+  import aes_pkg::*;
+  import aes_sbox_canright_pkg::*;
+
+  logic [1:0] b1, b0, q1, q0, c, r_sq, m11, m10, b1_inv, b0_inv;
+  assign b1  = b[3:2];
+  assign b0  = b[1:0];
+  assign q1  = q[3:2];
+  assign q0  = q[1:0];
+  assign m11 = m1[3:2];
+  assign m10 = m1[1:0];
+
+  // Get re-usable intermediate results.
+  (* keep = "true" *) logic [1:0] mul_b0_q1, mul_b1_q0, mul_q0_q1;
+  assign mul_b0_q1 = aes_mul_gf2p2(b0, q1);
+  assign mul_b1_q0 = aes_mul_gf2p2(b1, q0);
+  assign mul_q0_q1 = aes_mul_gf2p2(q0, q1);
+
+  // Formula 13
+  // IMPORTANT: The following ops must be executed in order (left to right):
+  // c = r ^ aes_scale_omega2_gf2p2(aes_square_gf2p2(b1 ^ b0))
+  //       ^ aes_scale_omega2_gf2p2(aes_square_gf2p2(q1 ^ q0))
+  //       ^ aes_mul_gf2p2(b1, b0)
+  //       ^ mul_b1_q0 ^ mul_b0_q1 ^ mul_q0_q1;
+  //
+  // Get intermediate terms. The terms they are added to depend on the same inputs,
+  // so aggressive synthesis optimizations must be prevented from recombining them.
+  (* keep = "true" *) logic [1:0] scale_omega2_b, scale_omega2_q;
+  (* keep = "true" *) logic [1:0] mul_b1_b0;
+  assign scale_omega2_b = aes_scale_omega2_gf2p2(aes_square_gf2p2(b1 ^ b0));
+  assign scale_omega2_q = aes_scale_omega2_gf2p2(aes_square_gf2p2(q1 ^ q0));
+  assign mul_b1_b0 = aes_mul_gf2p2(b1, b0);
+  // Generate c step by step.
+  (* keep = "true" *) logic [1:0] c_0, c_1, c_2, c_3, c_4;
+  assign c_0 = r   ^ scale_omega2_b;
+  assign c_1 = c_0 ^ scale_omega2_q;
+  assign c_2 = c_1 ^ mul_b1_b0;
+  assign c_3 = c_2 ^ mul_b1_q0;
+  assign c_4 = c_3 ^ mul_b0_q1;
+  assign c   = c_4 ^ mul_q0_q1;
+
+  // Below, c1_inv and c2_inv (derived from the output of the inverse over GF(2^2)) are again
+  // recombined with other inputs to c and c_inv. Aggressive synthesis optimizations across the
+  // GF(2^2) inverter may result in SCA leakage and should be avoided.
+  (* keep = "true" *) logic [1:0] c_inv, c1_inv, c2_inv;
+
+  // Formulas 14 and 15
+  assign c_inv = aes_square_gf2p2(c);
+  assign r_sq  = aes_square_gf2p2(r);
+
+  // Re-masking c_inv
+  // Formulas 21 and 23
+  // IMPORTANT: First combine the masks (ops in parens) then apply to c_inv:
+  // c1_inv = c_inv  ^ (q1 ^ r_sq);
+  // c2_inv = c1_inv ^ (q0 ^ q1);
+  //
+  // Generate c1_inv and c2_inv step by step.
+  (* keep = "true" *) logic [1:0] xor_q1_r_sq, xor_q0_q1;
+  assign xor_q1_r_sq = q1 ^ r_sq;
+  assign xor_q0_q1   = q0 ^ q1;
+  assign c1_inv      = c_inv  ^ xor_q1_r_sq;
+  assign c2_inv      = c1_inv ^ xor_q0_q1;
+
+  // Formulas 22 and 24
+  // IMPORTANT: The following ops must be executed in order (left to right):
+  // b1_inv = m11 ^ aes_mul_gf2p2(b0, c1_inv)
+  //              ^ mul_b0_q1 ^ aes_mul_gf2p2(q0, c1_inv) ^ mul_q0_q1;
+  // b0_inv = m10 ^ aes_mul_gf2p2(b1, c2_inv)
+  //              ^ mul_b1_q0 ^ aes_mul_gf2p2(q1, c2_inv) ^ mul_q0_q1;
+  //
+  // Get intermediate terms. The terms they are added to depend on the same inputs,
+  // so aggressive synthesis optimizations must be prevented from recombining them.
+  (* keep = "true" *) logic [1:0] mul_b0_c1_inv, mul_q0_c1_inv;
+  (* keep = "true" *) logic [1:0] mul_b1_c2_inv, mul_q1_c2_inv;
+  assign mul_b0_c1_inv = aes_mul_gf2p2(b0, c1_inv);
+  assign mul_q0_c1_inv = aes_mul_gf2p2(q0, c1_inv);
+  assign mul_b1_c2_inv = aes_mul_gf2p2(b1, c2_inv);
+  assign mul_q1_c2_inv = aes_mul_gf2p2(q1, c2_inv);
+  // Generate b1_inv and b0_inv step by step.
+  (* keep = "true" *) logic [1:0] b1_inv_0, b1_inv_1, b1_inv_2;
+  (* keep = "true" *) logic [1:0] b0_inv_0, b0_inv_1, b0_inv_2;
+  assign b1_inv_0 = m11      ^ mul_b0_c1_inv;
+  assign b1_inv_1 = b1_inv_0 ^ mul_b0_q1;
+  assign b1_inv_2 = b1_inv_1 ^ mul_q0_c1_inv;
+  assign b1_inv   = b1_inv_2 ^ mul_q0_q1;
+  assign b0_inv_0 = m10      ^ mul_b1_c2_inv;
+  assign b0_inv_1 = b0_inv_0 ^ mul_b1_q0;
+  assign b0_inv_2 = b0_inv_1 ^ mul_q1_c2_inv;
+  assign b0_inv   = b0_inv_2 ^ mul_q0_q1;
+
+  // Note: b_inv is masked by m1, b was masked by q.
+  assign b_inv = {b1_inv, b0_inv};
+endmodule
+
+// Masked inverse in GF(2^8), using normal basis [y^16, y]
+// (see Formulas 3, 12, 25, 26 and 27 in the paper)
+module aes_masked_inverse_gf2p8 (
+  input  logic [7:0] a,
+  input  logic [7:0] m,
+  input  logic [7:0] n,
+  output logic [7:0] a_inv
+);
+
+  import aes_pkg::*;
+  import aes_sbox_canright_pkg::*;
+
+  logic [3:0] a1, a0, m1, m0, q, s1, s0, a1_inv, a0_inv;
+  logic [1:0] r;
+
+  // The output of the inverse over GF(2^4) and signals derived from that are again recombined
+  // with inputs to the GF(2^4) inverter. Aggressive synthesis optimizations across the GF(2^4)
+  // inverter may result in SCA leakage and should be avoided.
+  (* keep = "true" *) logic [3:0] b, b_inv, b2_inv;
+
+  assign a1 = a[7:4];
+  assign a0 = a[3:0];
+  assign m1 = m[7:4];
+  assign m0 = m[3:0];
+
+  // Get re-usable intermediate results.
+  (* keep = "true" *) logic [3:0] mul_a0_m1, mul_a1_m0, mul_m0_m1;
+  assign mul_a0_m1 = aes_mul_gf2p4(a0, m1);
+  assign mul_a1_m0 = aes_mul_gf2p4(a1, m0);
+  assign mul_m0_m1 = aes_mul_gf2p4(m0, m1);
+
+  // q must be independent of m.
+  assign q = n[7:4];
+
+  // Formula 12
+  // IMPORTANT: The following ops must be executed in order (left to right):
+  // b = q ^ aes_square_scale_gf2p4_gf2p2(a1 ^ a0)
+  //       ^ aes_square_scale_gf2p4_gf2p2(m1 ^ m0)
+  //       ^ aes_mul_gf2p4(a1, a0)
+  //       ^ mul_a1_m0 ^ mul_a0_m1 ^ mul_m0_m1;
+  //
+  // Get intermediate terms. The terms they are added to depend on the same inputs,
+  // so aggressive synthesis optimizations must be prevented from recombining them.
+  (* keep = "true" *) logic [3:0] mul_a1_a0;
+  assign mul_a1_a0 = aes_mul_gf2p4(a1, a0);
+  // Generate b step by step.
+  (* keep = "true" *) logic [3:0] b_0, b_1, b_2, b_3, b_4;
+  assign b_0 = q   ^ aes_square_scale_gf2p4_gf2p2(a1 ^ a0); // q does not depend on a1, a0.
+  assign b_1 = b_0 ^ aes_square_scale_gf2p4_gf2p2(m1 ^ m0); // b_0 does not depend on m1, m0.
+  assign b_2 = b_1 ^ mul_a1_a0;
+  assign b_3 = b_2 ^ mul_a1_m0;
+  assign b_4 = b_3 ^ mul_a0_m1;
+  assign b   = b_4 ^ mul_m0_m1;
+
+  // r must be independent of q.
+  assign r = m1[3:2];
+
+  // b is masked by q, b_inv is masked by m1.
+  aes_masked_inverse_gf2p4 aes_masked_inverse_gf2p4 (
+    .b     ( b     ),
+    .q     ( q     ),
+    .r     ( r     ),
+    .m1    ( m1    ),
+    .b_inv ( b_inv )
+  );
+
+  // Formula 26
+  // IMPORTANT: First combine the masks (ops in parens) then apply to b_inv:
+  // b2_inv = b_inv ^ (m1 ^ m0);
+  //
+  // Generate b2_inv step by step.
+  (* keep = "true" *) logic [3:0] xor_m1_m0;
+  assign xor_m1_m0 = m1 ^ m0;
+  assign b2_inv    = b_inv ^ xor_m1_m0;
+
+  // s is the specified output mask n.
+  assign s1 = n[7:4];
+  assign s0 = n[3:0];
+
+  // Formulas 25 and 27
+  // IMPORTANT: The following ops must be executed in order (left to right):
+  // a1_inv = s1 ^ aes_mul_gf2p4(a0, b_inv)
+  //             ^ mul_a0_m1 ^ aes_mul_gf2p4(m0, b_inv)  ^ mul_m0_m1;
+  // a0_inv = s0 ^ aes_mul_gf2p4(a1, b2_inv)
+  //             ^ mul_a1_m0 ^ aes_mul_gf2p4(m1, b2_inv) ^ mul_m0_m1;
+  //
+  // Get intermediate terms. The terms they are added to depend on the same inputs,
+  // so aggressive synthesis optimizations must be prevented from recombining them.
+  (* keep = "true" *) logic [3:0] mul_a0_b_inv,  mul_m0_b_inv;
+  (* keep = "true" *) logic [3:0] mul_a1_b2_inv, mul_m1_b2_inv;
+  assign mul_a0_b_inv  = aes_mul_gf2p4(a0, b_inv);
+  assign mul_m0_b_inv  = aes_mul_gf2p4(m0, b_inv);
+  assign mul_a1_b2_inv = aes_mul_gf2p4(a1, b2_inv);
+  assign mul_m1_b2_inv = aes_mul_gf2p4(m1, b2_inv);
+  // Generate a1_inv and a0_inv step by step.
+  (* keep = "true" *) logic [3:0] a1_inv_0, a1_inv_1, a1_inv_2;
+  (* keep = "true" *) logic [3:0] a0_inv_0, a0_inv_1, a0_inv_2;
+  assign a1_inv_0 = s1       ^ mul_a0_b_inv;
+  assign a1_inv_1 = a1_inv_0 ^ mul_a0_m1;
+  assign a1_inv_2 = a1_inv_1 ^ mul_m0_b_inv;
+  assign a1_inv   = a1_inv_2 ^ mul_m0_m1;
+  assign a0_inv_0 = s0       ^ mul_a1_b2_inv; // Actually, s0 does not depend on a1, b2_inv.
+  assign a0_inv_1 = a0_inv_0 ^ mul_a1_m0;
+  assign a0_inv_2 = a0_inv_1 ^ mul_m1_b2_inv;
+  assign a0_inv   = a0_inv_2 ^ mul_m0_m1;
+
+  // Note: a_inv is now masked by s = n, a was masked by m.
+  assign a_inv = {a1_inv, a0_inv};
+endmodule
+
 module aes_sbox_canright_masked (
   input  aes_pkg::ciph_op_e op_i,
   input  logic [7:0]        data_i,     // masked, the actual input data is data_i ^ in_mask_i
@@ -32,134 +247,16 @@
   import aes_pkg::*;
   import aes_sbox_canright_pkg::*;
 
-  ///////////////
-  // Functions //
-  ///////////////
-
-  // Masked inverse in GF(2^4), using normal basis [z^4, z]
-  // (see Formulas 6, 13, 14, 15, 21, 22, 23, 24 in the paper)
-  function automatic logic [3:0] aes_masked_inverse_gf2p4(logic [3:0] b,
-                                                          logic [3:0] q,
-                                                          logic [1:0] r,
-                                                          logic [3:0] m1);
-    logic [3:0] b_inv;
-    logic [1:0] b1, b0, q1, q0, c, c_inv, c2_inv, r_sq, m11, m10, b1_inv, b0_inv;
-    logic [1:0] mul_b0_q1, mul_b1_q0, mul_q0_q1;
-    b1  = b[3:2];
-    b0  = b[1:0];
-    q1  = q[3:2];
-    q0  = q[1:0];
-    m11 = m1[3:2];
-    m10 = m1[1:0];
-
-    // Get re-usable intermediate results.
-    mul_b0_q1 = aes_mul_gf2p2(b0, q1);
-    mul_b1_q0 = aes_mul_gf2p2(b1, q0);
-    mul_q0_q1 = aes_mul_gf2p2(q0, q1);
-
-    // Formula 13
-    // IMPORTANT: The following ops must be executed in order (left to right):
-    c = r ^ aes_scale_omega2_gf2p2(aes_square_gf2p2(b1 ^ b0))
-          ^ aes_scale_omega2_gf2p2(aes_square_gf2p2(q1 ^ q0))
-          ^ aes_mul_gf2p2(b1, b0)
-          ^ aes_mul_gf2p2(b1, q0) ^ mul_b0_q1 ^ mul_q0_q1;
-    //
-
-    // Formulas 14 and 15
-    c_inv = aes_square_gf2p2(c);
-    r_sq  = aes_square_gf2p2(r);
-
-    // Re-masking c_inv
-    // Formulas 21 and 23
-    // IMPORTANT: First combine the masks (ops in parens) then apply to c_inv:
-    c_inv  = c_inv ^ (q1 ^ r_sq);
-    c2_inv = c_inv ^ (q0 ^ q1);
-    //
-
-    // Formulas 22 and 24
-    // IMPORTANT: The following ops must be executed in order (left to right):
-    b1_inv = m11 ^ aes_mul_gf2p2(b0, c_inv)
-                 ^ mul_b0_q1 ^ aes_mul_gf2p2(q0, c_inv)  ^ mul_q0_q1;
-    b0_inv = m10 ^ aes_mul_gf2p2(b1, c2_inv)
-                 ^ mul_b1_q0 ^ aes_mul_gf2p2(q1, c2_inv) ^ mul_q0_q1;
-    //
-
-    // Note: b_inv is masked by m1, b was masked by q.
-    b_inv = {b1_inv, b0_inv};
-
-    return b_inv;
-  endfunction
-
-  // Masked inverse in GF(2^8), using normal basis [y^16, y]
-  // (see Formulas 3, 12, 25, 26 and 27 in the paper)
-  function automatic logic [7:0] aes_masked_inverse_gf2p8(logic [7:0] a,
-                                                          logic [7:0] m,
-                                                          logic [7:0] n);
-    logic [7:0] a_inv;
-    logic [3:0] a1, a0, m1, m0, b, b_inv, b2_inv, q, s1, s0, a1_inv, a0_inv;
-    logic [3:0] mul_a0_m1, mul_a1_m0, mul_m0_m1;
-    logic [1:0] r;
-    a1 = a[7:4];
-    a0 = a[3:0];
-    m1 = m[7:4];
-    m0 = m[3:0];
-
-    // Get re-usable intermediate results.
-    mul_a0_m1 = aes_mul_gf2p4(a0, m1);
-    mul_a1_m0 = aes_mul_gf2p4(a1, m0);
-    mul_m0_m1 = aes_mul_gf2p4(m0, m1);
-
-    // q must be independent of m.
-    q = n[7:4];
-
-    // Formula 12
-    // IMPORTANT: The following ops must be executed in order (left to right):
-    b = q ^ aes_square_scale_gf2p4_gf2p2(a1 ^ a0)
-          ^ aes_square_scale_gf2p4_gf2p2(m1 ^ m0)
-          ^ aes_mul_gf2p4(a1, a0)
-          ^ mul_a1_m0 ^ mul_a0_m1 ^ mul_m0_m1;
-    //
-
-    // r must be independent of q.
-    r = m1[3:2];
-
-    // b is masked by q, b_inv is masked by m1.
-    b_inv = aes_masked_inverse_gf2p4(b, q, r, m1);
-
-    // Formula 26
-    // IMPORTANT: First combine the masks (ops in parens) then apply to b_inv:
-    b2_inv = b_inv ^ (m1 ^ m0);
-    //
-
-    // s is the specified output mask n.
-    s1 = n[7:4];
-    s0 = n[3:0];
-
-    // Formulas 25 and 27
-    // IMPORTANT: The following ops must be executed in order (left to right):
-    a1_inv = s1 ^ aes_mul_gf2p4(a0, b_inv)
-                ^ mul_a0_m1 ^ aes_mul_gf2p4(m0, b_inv)  ^ mul_m0_m1;
-    a0_inv = s0 ^ aes_mul_gf2p4(a1, b2_inv)
-                ^ mul_a1_m0 ^ aes_mul_gf2p4(m1, b2_inv) ^ mul_m0_m1;
-    //
-
-    // Note: a_inv is now masked by s = n, a was masked by m.
-    a_inv = {a1_inv, a0_inv};
-
-    return a_inv;
-  endfunction
-
   //////////////////////////
   // Masked Canright SBox //
   //////////////////////////
 
-  logic [7:0] data_basis_x, data_inverse;
-  logic [7:0] in_mask_basis_x;
-  logic [7:0] out_mask_basis_x;
+  logic [7:0] in_data_basis_x, out_data_basis_x;
+  logic [7:0] in_mask_basis_x, out_mask_basis_x;
 
   // Convert data to normal basis X.
-  assign data_basis_x = (op_i == CIPH_FWD) ? aes_mvm(data_i, A2X) :
-                                             aes_mvm(data_i ^ 8'h63, S2X);
+  assign in_data_basis_x = (op_i == CIPH_FWD) ? aes_mvm(data_i, A2X) :
+                                                aes_mvm(data_i ^ 8'h63, S2X);
 
   // Convert masks to normal basis X.
   // The addition of constant 8'h63 following the affine transformation is skipped.
@@ -171,10 +268,15 @@
                                                  aes_mvm(out_mask_i, S2X);
 
   // Do the inversion in normal basis X.
-  assign data_inverse = aes_masked_inverse_gf2p8(data_basis_x, in_mask_basis_x, out_mask_basis_x);
+  aes_masked_inverse_gf2p8 aes_masked_inverse_gf2p8 (
+    .a     ( in_data_basis_x  ), // input
+    .m     ( in_mask_basis_x  ), // input
+    .n     ( out_mask_basis_x ), // input
+    .a_inv ( out_data_basis_x )  // output
+  );
 
   // Convert to basis S or A.
-  assign data_o = (op_i == CIPH_FWD) ? (aes_mvm(data_inverse, X2S) ^ 8'h63) :
-                                       (aes_mvm(data_inverse, X2A));
+  assign data_o = (op_i == CIPH_FWD) ? (aes_mvm(out_data_basis_x, X2S) ^ 8'h63) :
+                                       (aes_mvm(out_data_basis_x, X2A));
 
 endmodule
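
For reference (not part of the patch itself): the functional contract of the
masked S-Box is data_o ^ out_mask_i == SBox(data_i ^ in_mask_i). A minimal
simulation-only checker sketch, assuming the existing unmasked
aes_sbox_canright module and the port names used above (the checker module
itself is hypothetical):

  module aes_sbox_canright_masked_checker (
    input aes_pkg::ciph_op_e op_i,
    input logic [7:0]        data_i,      // masked input
    input logic [7:0]        in_mask_i,
    input logic [7:0]        out_mask_i,
    input logic [7:0]        data_o       // masked output of the S-Box under test
  );
    logic [7:0] exp_data;

    // Reference: unmasked Canright S-Box fed with the unmasked input.
    aes_sbox_canright u_aes_sbox_canright_ref (
      .op_i   ( op_i               ),
      .data_i ( data_i ^ in_mask_i ),
      .data_o ( exp_data           )
    );

    // After unmasking, the output of the masked S-Box must match the reference.
    always_comb begin
      assert ((data_o ^ out_mask_i) == exp_data);
    end
  endmodule
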
diff --git a/hw/ip/aes/rtl/aes_sbox_canright_masked_noreuse.sv b/hw/ip/aes/rtl/aes_sbox_canright_masked_noreuse.sv
index d6195a9..b5eb6d9 100644
--- a/hw/ip/aes/rtl/aes_sbox_canright_masked_noreuse.sv
+++ b/hw/ip/aes/rtl/aes_sbox_canright_masked_noreuse.sv
@@ -15,10 +15,228 @@
 // IMPORTANT NOTE:                                                                               //
 //                            DO NOT USE THIS FOR SYNTHESIS BLINDLY!                             //
 //                                                                                               //
-// This is a high-level implementation targeting primarily RTL simulation. Synthesis tools might //
-// heavily optimize the design. The result is likely insecure. Use with care.                    //
+// This implementation primarily targets Xilinx Vivado synthesis as well as RTL simulation. It   //
+// contains synthesis attributes specific to Xilinx Vivado to enforce the correct ordering of    //
+// operations and avoid aggressive optimization. Other synthesis tools might still heavily       //
+// optimize the design. The result is likely insecure. Use with care.                            //
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
+// Masked inverse in GF(2^4), using normal basis [z^4, z]
+// (see Formulas 6, 13, 14, 15, 16, 17 in the paper)
+module aes_masked_inverse_gf2p4_noreuse (
+  input  logic [3:0] b,
+  input  logic [3:0] q,
+  input  logic [1:0] r,
+  input  logic [3:0] t,
+  output logic [3:0] b_inv
+);
+
+  import aes_pkg::*;
+  import aes_sbox_canright_pkg::*;
+
+  logic [1:0] b1, b0, q1, q0, c, c_inv, r_sq, t1, t0, b1_inv, b0_inv;
+  assign b1 = b[3:2];
+  assign b0 = b[1:0];
+  assign q1 = q[3:2];
+  assign q0 = q[1:0];
+  assign t1 = t[3:2];
+  assign t0 = t[1:0];
+
+  // Formula 13
+  // IMPORTANT: The following ops must be executed in order (left to right):
+  // c = r ^ aes_scale_omega2_gf2p2(aes_square_gf2p2(b1 ^ b0))
+  //       ^ aes_scale_omega2_gf2p2(aes_square_gf2p2(q1 ^ q0))
+  //       ^ aes_mul_gf2p2(b1, b0)
+  //       ^ aes_mul_gf2p2(b1, q0) ^ aes_mul_gf2p2(b0, q1) ^ aes_mul_gf2p2(q1, q0);
+  //
+  // Get intermediate terms. The terms they are added to depend on the same inputs,
+  // so aggressive synthesis optimizations must be prevented from recombining them.
+  (* keep = "true" *) logic [1:0] scale_omega2_b, scale_omega2_q;
+  (* keep = "true" *) logic [1:0] mul_b1_b0, mul_b1_q0, mul_b0_q1, mul_q1_q0;
+  assign scale_omega2_b = aes_scale_omega2_gf2p2(aes_square_gf2p2(b1 ^ b0));
+  assign scale_omega2_q = aes_scale_omega2_gf2p2(aes_square_gf2p2(q1 ^ q0));
+  assign mul_b1_b0 = aes_mul_gf2p2(b1, b0);
+  assign mul_b1_q0 = aes_mul_gf2p2(b1, q0);
+  assign mul_b0_q1 = aes_mul_gf2p2(b0, q1);
+  assign mul_q1_q0 = aes_mul_gf2p2(q1, q0);
+
+  // Generate c step by step.
+  (* keep = "true" *) logic [1:0] c_0, c_1, c_2, c_3, c_4;
+  assign c_0 = r   ^ scale_omega2_b;
+  assign c_1 = c_0 ^ scale_omega2_q;
+  assign c_2 = c_1 ^ mul_b1_b0;
+  assign c_3 = c_2 ^ mul_b1_q0;
+  assign c_4 = c_3 ^ mul_b0_q1;
+  assign c   = c_4 ^ mul_q1_q0;
+
+  // Formulas 14 and 15
+  assign c_inv = aes_square_gf2p2(c);
+  assign r_sq  = aes_square_gf2p2(r);
+
+  // Formulas 16 and 17
+  // IMPORTANT: The following ops must be executed in order (left to right):
+  // b1_inv = t1 ^ aes_mul_gf2p2(b0, c_inv)
+  //             ^ aes_mul_gf2p2(b0, r_sq) ^ aes_mul_gf2p2(q0, c_inv) ^ aes_mul_gf2p2(q0, r_sq);
+  // b0_inv = t0 ^ aes_mul_gf2p2(b1, c_inv)
+  //             ^ aes_mul_gf2p2(b1, r_sq) ^ aes_mul_gf2p2(q1, c_inv) ^ aes_mul_gf2p2(q1, r_sq);
+  //
+  // Get intermediate terms. The terms they are added to depend on the same inputs,
+  // so aggressive synthesis optimizations must be prevented from recombining them.
+  (* keep = "true" *) logic [1:0] mul_b0_r_sq, mul_q0_c_inv, mul_q0_r_sq;
+  (* keep = "true" *) logic [1:0] mul_b1_r_sq, mul_q1_c_inv, mul_q1_r_sq;
+  assign mul_b0_r_sq  = aes_mul_gf2p2(b0, r_sq);
+  assign mul_q0_c_inv = aes_mul_gf2p2(q0, c_inv);
+  assign mul_q0_r_sq  = aes_mul_gf2p2(q0, r_sq);
+  assign mul_b1_r_sq  = aes_mul_gf2p2(b1, r_sq);
+  assign mul_q1_c_inv = aes_mul_gf2p2(q1, c_inv);
+  assign mul_q1_r_sq  = aes_mul_gf2p2(q1, r_sq);
+
+  // Generate b1_inv and b0_inv step by step.
+  (* keep = "true" *) logic [1:0] b1_inv_0, b1_inv_1, b1_inv_2;
+  (* keep = "true" *) logic [1:0] b0_inv_0, b0_inv_1, b0_inv_2;
+  assign b1_inv_0 = t1       ^ aes_mul_gf2p2(b0, c_inv); // t1 does not depend on b0, c_inv.
+  assign b1_inv_1 = b1_inv_0 ^ mul_b0_r_sq;
+  assign b1_inv_2 = b1_inv_1 ^ mul_q0_c_inv;
+  assign b1_inv   = b1_inv_2 ^ mul_q0_r_sq;
+  assign b0_inv_0 = t0       ^ aes_mul_gf2p2(b1, c_inv); // t0 does not depend on b1, c_inv.
+  assign b0_inv_1 = b0_inv_0 ^ mul_b1_r_sq;
+  assign b0_inv_2 = b0_inv_1 ^ mul_q1_c_inv;
+  assign b0_inv   = b0_inv_2 ^ mul_q1_r_sq;
+
+  // Note: b_inv is masked by t, b was masked by q.
+  assign b_inv = {b1_inv, b0_inv};
+
+endmodule
+
+// Masked inverse in GF(2^8), using normal basis [y^16, y]
+// (see Formulas 3, 12, 18 and 19 in the paper)
+module aes_masked_inverse_gf2p8_noreuse (
+  input  logic [7:0] a,
+  input  logic [7:0] m,
+  input  logic [7:0] n,
+  output logic [7:0] a_inv
+);
+
+  import aes_pkg::*;
+  import aes_sbox_canright_pkg::*;
+
+  logic [3:0] a1, a0, m1, m0, q, s1, s0, t, a1_inv, a0_inv;
+  logic [1:0] r;
+
+  // The output of the inverse over GF(2^4) and signals derived from that are again recombined
+  // with inputs to the GF(2^4) inverter. Aggressive synthesis optimizations across the GF(2^4)
+  // inverter may result in SCA leakage and should be avoided.
+  (* keep = "true" *) logic [3:0] b, b_inv;
+
+  assign a1 = a[7:4];
+  assign a0 = a[3:0];
+  assign m1 = m[7:4];
+  assign m0 = m[3:0];
+
+  // q must be independent of m.
+  assign q = n[7:4];
+
+  // Formula 12
+  // IMPORTANT: The following ops must be executed in order (left to right):
+  // b = q ^ aes_square_scale_gf2p4_gf2p2(a1 ^ a0)
+  //       ^ aes_square_scale_gf2p4_gf2p2(m1 ^ m0)
+  //       ^ aes_mul_gf2p4(a1, a0)
+  //       ^ aes_mul_gf2p4(a1, m0) ^ aes_mul_gf2p4(a0, m1) ^ aes_mul_gf2p4(m0, m1);
+  //
+  // Get intermediate terms. The terms they are added to depend on the same inputs,
+  // so aggressive synthesis optimizations must be prevented from recombining them.
+  (* keep = "true" *) logic [3:0] mul_a1_a0, mul_a1_m0, mul_a0_m1, mul_m0_m1;
+  assign mul_a1_a0 = aes_mul_gf2p4(a1, a0);
+  assign mul_a1_m0 = aes_mul_gf2p4(a1, m0);
+  assign mul_a0_m1 = aes_mul_gf2p4(a0, m1);
+  assign mul_m0_m1 = aes_mul_gf2p4(m0, m1);
+  // Generate b step by step.
+  (* keep = "true" *) logic [3:0] b_0, b_1, b_2, b_3, b_4;
+  assign b_0 = q   ^ aes_square_scale_gf2p4_gf2p2(a1 ^ a0); // q does not depend on a1, a0.
+  assign b_1 = b_0 ^ aes_square_scale_gf2p4_gf2p2(m1 ^ m0); // b_0 does not depend on m1, m0.
+  assign b_2 = b_1 ^ mul_a1_a0;
+  assign b_3 = b_2 ^ mul_a1_m0;
+  assign b_4 = b_3 ^ mul_a0_m1;
+  assign b   = b_4 ^ mul_m0_m1;
+
+  // r must be independent of q.
+  assign r = m1[3:2];
+
+  // Note that the paper states the following requirements on t:
+  // - t must be independent of r.
+  // - t1 must be independent of q0, t0 must be independent of q1.
+  // - t must be independent of m (for the final steps involving s).
+  // The paper suggests using t = q. To select s = n for the output mask (s must be independent
+  // of t = q = n[7:4]), we would need t = m0 or similar (not r, m1[3:2] though), but this would
+  // break the random product distribution of aes_mul_gf2p4(m0, t), or aes_mul_gf2p4(m1, t) below
+  // (see Lemma 2 in the paper). For this reason, we select t = q here and apply a final mask
+  // switch from s = m to n after the inversion.
+  assign t = q;
+
+  // b is masked by q, b_inv is masked by t.
+  aes_masked_inverse_gf2p4_noreuse aes_masked_inverse_gf2p4 (
+    .b     ( b     ),
+    .q     ( q     ),
+    .r     ( r     ),
+    .t     ( t     ),
+    .b_inv ( b_inv )
+  );
+
+  // Note that the paper states the following requirements on s:
+  // - s must be independent of t.
+  // - s1 must be independent of m0, s0 must be independent of m1.
+  // The paper suggests using s = m (the input mask). To still end up with the specified output
+  // mask n, we will apply a final mask switch after the inversion.
+  assign s1 = m1;
+  assign s0 = m0;
+
+  // Formulas 18 and 19
+  // IMPORTANT: The following ops must be executed in order (left to right):
+  // a1_inv = s1 ^ aes_mul_gf2p4(a0, b_inv)
+  //             ^ aes_mul_gf2p4(a0, t) ^ aes_mul_gf2p4(m0, b_inv) ^ aes_mul_gf2p4(m0, t);
+  // a0_inv = s0 ^ aes_mul_gf2p4(a1, b_inv)
+  //             ^ aes_mul_gf2p4(a1, t) ^ aes_mul_gf2p4(m1, b_inv) ^ aes_mul_gf2p4(m1, t);
+  //
+  // Get intermediate terms. The terms they are added to depend on the same inputs,
+  // so aggressive synthesis optimizations must be prevented from recombining them.
+  (* keep = "true" *) logic [3:0] mul_a0_b_inv, mul_a0_t, mul_m0_b_inv, mul_m0_t;
+  (* keep = "true" *) logic [3:0] mul_a1_b_inv, mul_a1_t, mul_m1_b_inv, mul_m1_t;
+  assign mul_a0_b_inv = aes_mul_gf2p4(a0, b_inv);
+  assign mul_a0_t     = aes_mul_gf2p4(a0, t);
+  assign mul_m0_b_inv = aes_mul_gf2p4(m0, b_inv);
+  assign mul_m0_t     = aes_mul_gf2p4(m0, t);
+  assign mul_a1_b_inv = aes_mul_gf2p4(a1, b_inv);
+  assign mul_a1_t     = aes_mul_gf2p4(a1, t);
+  assign mul_m1_b_inv = aes_mul_gf2p4(m1, b_inv);
+  assign mul_m1_t     = aes_mul_gf2p4(m1, t);
+
+  // Generate a1_inv and a0_inv step by step.
+  (* keep = "true" *) logic [3:0] a1_inv_0, a1_inv_1, a1_inv_2;
+  (* keep = "true" *) logic [3:0] a0_inv_0, a0_inv_1, a0_inv_2;
+  assign a1_inv_0 = s1       ^ mul_a0_b_inv;
+  assign a1_inv_1 = a1_inv_0 ^ mul_a0_t;
+  assign a1_inv_2 = a1_inv_1 ^ mul_m0_b_inv;
+  assign a1_inv   = a1_inv_2 ^ mul_m0_t;
+  assign a0_inv_0 = s0       ^ mul_a1_b_inv;
+  assign a0_inv_1 = a0_inv_0 ^ mul_a1_t;
+  assign a0_inv_2 = a0_inv_1 ^ mul_m1_b_inv;
+  assign a0_inv   = a0_inv_2 ^ mul_m1_t;
+
+  // Note: a_inv is now masked by s = m, a was masked by m.
+  (* keep = "true" *) logic [7:0] a_inv_0;
+  assign a_inv_0 = {a1_inv, a0_inv};
+
+  // To have a_inv masked by n (the specified output mask), we perform a final mask switch.
+  // IMPORTANT: The following ops must be executed in order (left to right):
+  // a_inv = a_inv ^ n ^ m;
+  //
+  // Generate a_inv step by step.
+  (* keep = "true" *) logic [7:0] a_inv_1;
+  assign a_inv_1 = a_inv_0 ^ n;
+  assign a_inv   = a_inv_1 ^ m;
+
+endmodule
+
 module aes_sbox_canright_masked_noreuse (
   input  aes_pkg::ciph_op_e op_i,
   input  logic [7:0]        data_i,     // masked, the actual input data is data_i ^ in_mask_i
@@ -30,130 +248,16 @@
   import aes_pkg::*;
   import aes_sbox_canright_pkg::*;
 
-  ///////////////
-  // Functions //
-  ///////////////
-
-  // Masked inverse in GF(2^4), using normal basis [z^4, z]
-  // (see Formulas 6, 13, 14, 15, 16, 17 in the paper)
-  function automatic logic [3:0] aes_masked_inverse_gf2p4(logic [3:0] b,
-                                                          logic [3:0] q,
-                                                          logic [1:0] r,
-                                                          logic [3:0] t);
-    logic [3:0] b_inv;
-    logic [1:0] b1, b0, q1, q0, c, c_inv, r_sq, t1, t0, b1_inv, b0_inv;
-    b1 = b[3:2];
-    b0 = b[1:0];
-    q1 = q[3:2];
-    q0 = q[1:0];
-    t1 = t[3:2];
-    t0 = t[1:0];
-
-    // Formula 13
-    // IMPORTANT: The following ops must be executed in order (left to right):
-    c = r ^ aes_scale_omega2_gf2p2(aes_square_gf2p2(b1 ^ b0))
-          ^ aes_scale_omega2_gf2p2(aes_square_gf2p2(q1 ^ q0))
-          ^ aes_mul_gf2p2(b1, b0)
-          ^ aes_mul_gf2p2(b1, q0) ^ aes_mul_gf2p2(b0, q1) ^ aes_mul_gf2p2(q1, q0);
-    //
-
-    // Formulas 14 and 15
-    c_inv = aes_square_gf2p2(c);
-    r_sq  = aes_square_gf2p2(r);
-
-    // Formulas 16 and 17
-    // IMPORTANT: The following ops must be executed in order (left to right):
-    b1_inv = t1 ^ aes_mul_gf2p2(b0, c_inv)
-                ^ aes_mul_gf2p2(b0, r_sq) ^ aes_mul_gf2p2(q0, c_inv) ^ aes_mul_gf2p2(q0, r_sq);
-    b0_inv = t0 ^ aes_mul_gf2p2(b1, c_inv)
-                ^ aes_mul_gf2p2(b1, r_sq) ^ aes_mul_gf2p2(q1, c_inv) ^ aes_mul_gf2p2(q1, r_sq);
-    //
-
-    // Note: b_inv is masked by t, b was masked by q.
-    b_inv = {b1_inv, b0_inv};
-
-    return b_inv;
-  endfunction
-
-  // Masked inverse in GF(2^8), using normal basis [y^16, y]
-  // (see Formulas 3, 12, 18 and 19 in the paper)
-  function automatic logic [7:0] aes_masked_inverse_gf2p8(logic [7:0] a,
-                                                          logic [7:0] m,
-                                                          logic [7:0] n);
-    logic [7:0] a_inv;
-    logic [3:0] a1, a0, m1, m0, b, b_inv, q, s1, s0, t, a1_inv, a0_inv;
-    logic [1:0] r;
-    a1 = a[7:4];
-    a0 = a[3:0];
-    m1 = m[7:4];
-    m0 = m[3:0];
-
-    // q must be independent of m.
-    q = n[7:4];
-
-    // Formula 12
-    // IMPORTANT: The following ops must be executed in order (left to right):
-    b = q ^ aes_square_scale_gf2p4_gf2p2(a1 ^ a0)
-          ^ aes_square_scale_gf2p4_gf2p2(m1 ^ m0)
-          ^ aes_mul_gf2p4(a1, a0)
-          ^ aes_mul_gf2p4(a1, m0) ^ aes_mul_gf2p4(a0, m1) ^ aes_mul_gf2p4(m1, m0);
-    //
-
-    // r must be independent of q.
-    r = m1[3:2];
-
-    // Note that the paper states the following requirements on t:
-    // - t must be independent of r.
-    // - t1 must be independent of q0, t0 must be independent of q1.
-    // - t must be independent of m (for the final steps involving s)
-    // The paper suggests to use t = q. To select s = n for the output mask (s must be independent
-    // of t = q = n[7:4]), we would need t = m0 or similar (not r, m1[3:2] though), but this would
-    // break the random product distribution of aes_mul_gf2p4(m0, t), or aes_mul_gf2p4(m1, t) below
-    // (see Lemma 2 in the paper). For this reason, we select t = q here and apply a final mask
-    // switch from s = m to n after the inversion.
-    t = q;
-
-    // b is masked by q, b_inv is masked by t.
-    b_inv = aes_masked_inverse_gf2p4(b, q, r, t);
-
-    // Note that the paper states the following requirements on s:
-    // - s must be independent of t
-    // - s1 must be independent of m0, s0 must be independent of m1.
-    // The paper suggests to use s = m (the input mask). To still end up with the specified output
-    // mask n, we will apply a final mask switch after the inversion.
-    s1 = m1;
-    s0 = m0;
-
-    // Formulas 18 and 19
-    // IMPORTANT: The following ops must be executed in order (left to right):
-    a1_inv = s1 ^ aes_mul_gf2p4(a0, b_inv)
-                ^ aes_mul_gf2p4(a0, t) ^ aes_mul_gf2p4(m0, b_inv) ^ aes_mul_gf2p4(m0, t);
-    a0_inv = s0 ^ aes_mul_gf2p4(a1, b_inv)
-                ^ aes_mul_gf2p4(a1, t) ^ aes_mul_gf2p4(m1, b_inv) ^ aes_mul_gf2p4(m1, t);
-    //
-
-    // Note: a_inv is now masked by s = m, a was masked by m.
-    a_inv = {a1_inv, a0_inv};
-
-    // To have a_inv masked by n (the specified output mask), we perform a final mask switch.
-    // IMPORTANT: The following ops must be executed in order (left to right):
-    a_inv = a_inv ^ n ^ m;
-    //
-
-    return a_inv;
-  endfunction
-
   //////////////////////////
   // Masked Canright SBox //
   //////////////////////////
 
-  logic [7:0] data_basis_x, data_inverse;
-  logic [7:0] in_mask_basis_x;
-  logic [7:0] out_mask_basis_x;
+  logic [7:0] in_data_basis_x, out_data_basis_x;
+  logic [7:0] in_mask_basis_x, out_mask_basis_x;
 
   // Convert data to normal basis X.
-  assign data_basis_x = (op_i == CIPH_FWD) ? aes_mvm(data_i, A2X) :
-                                             aes_mvm(data_i ^ 8'h63, S2X);
+  assign in_data_basis_x = (op_i == CIPH_FWD) ? aes_mvm(data_i, A2X) :
+                                                aes_mvm(data_i ^ 8'h63, S2X);
 
   // Convert masks to normal basis X.
   // The addition of constant 8'h63 following the affine transformation is skipped.
@@ -165,10 +269,15 @@
                                                  aes_mvm(out_mask_i, S2X);
 
   // Do the inversion in normal basis X.
-  assign data_inverse = aes_masked_inverse_gf2p8(data_basis_x, in_mask_basis_x, out_mask_basis_x);
+  aes_masked_inverse_gf2p8_noreuse aes_masked_inverse_gf2p8 (
+    .a     ( in_data_basis_x  ), // input
+    .m     ( in_mask_basis_x  ), // input
+    .n     ( out_mask_basis_x ), // input
+    .a_inv ( out_data_basis_x )  // output
+  );
 
   // Convert to basis S or A.
-  assign data_o = (op_i == CIPH_FWD) ? (aes_mvm(data_inverse, X2S) ^ 8'h63) :
-                                       (aes_mvm(data_inverse, X2A));
+  assign data_o = (op_i == CIPH_FWD) ? (aes_mvm(out_data_basis_x, X2S) ^ 8'h63) :
+                                       (aes_mvm(out_data_basis_x, X2A));
 
 endmodule
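
A note on the porting caveat from the commit message: the keep attribute used
above is specific to Xilinx Vivado. Other tools have their own spellings for
the same "preserve this net" directive; the following sketch lists commonly
used equivalents and is an assumption to be verified against the respective
tool documentation, not something taken from this patch:

  // Tool-specific attributes for preserving an intermediate net
  // (stacked here only for illustration; use the one your flow honors).
  (* keep = "true" *)   // Xilinx Vivado
  (* syn_keep = 1 *)    // Synplify-based FPGA flows
  logic [1:0] x_0;
  // In ASIC flows (e.g. Synopsys Design Compiler), the equivalent is typically
  // a dont_touch constraint applied from Tcl rather than an RTL attribute.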