[aes] Use prim_flop_en/buf primitives inside DOM S-Box

The prim_flop_en and prim_buf primitives encapsulate the tool-specific
synthesis attributes. When switching to a different synthesis tool,
users must provide primitives with the proper attributes for that
tool. Otherwise, the resulting netlist might be insecure.

Signed-off-by: Pirmin Vogel <vogelpi@lowrisc.org>
diff --git a/hw/ip/aes/rtl/aes_sbox_dom.sv b/hw/ip/aes/rtl/aes_sbox_dom.sv
index 337a290..6843e87 100644
--- a/hw/ip/aes/rtl/aes_sbox_dom.sv
+++ b/hw/ip/aes/rtl/aes_sbox_dom.sv
@@ -24,10 +24,10 @@
 // IMPORTANT NOTE:                                                                               //
 //                            DO NOT USE THIS FOR SYNTHESIS BLINDLY!                             //
 //                                                                                               //
-// This implementation targets primarily Xilinx Vivado synthesis as well as RTL simulation. It   //
-// contains synthesis attributes specific to Xilinx Vivado to prevent the synthesis tool from    //
-// optimizing away registers and to enforce the correct ordering of operations. Other synthesis  //
-// tools might still heavily optimize the design. The result is likely insecure. Use with care.  //
+// This implementation relies on primitives like prim_buf/prim_flop_en containing tool-specific  //
+// synthesis attributes to prevent the synthesis tool from optimizing away/re-ordering registers //
+// and to enforce the correct ordering of operations. Without the proper primitives, synthesis   //
+// tools might heavily optimize the design. The result is likely insecure. Use with care.        //
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
 `include "prim_assert.sv"
@@ -68,7 +68,7 @@
   // Calculation //
   /////////////////
   // Inner-domain terms
-  (* keep = "true" *) logic [NPower-1:0] mul_ax_ay_d, mul_bx_by_d;
+  logic [NPower-1:0] mul_ax_ay_d, mul_bx_by_d;
   if (NPower == 4) begin : gen_inner_mul_gf2p4
     assign mul_ax_ay_d = aes_mul_gf2p4(a_x, a_y);
     assign mul_bx_by_d = aes_mul_gf2p4(b_x, b_y);
@@ -94,20 +94,21 @@
   ///////////////
   // Resharing of cross-domain terms
   logic [NPower-1:0] aq_z0_d, bq_z0_d;
-  (* keep = "true" *) logic [NPower-1:0] aq_z0_q, bq_z0_q;
+  logic [NPower-1:0] aq_z0_q, bq_z0_q;
   assign aq_z0_d = z_0 ^ mul_ax_by;
   assign bq_z0_d = z_0 ^ mul_ay_bx;
 
   // Registers
-  always_ff @(posedge clk_i or negedge rst_ni) begin
-    if (!rst_ni) begin
-      aq_z0_q <= '0;
-      bq_z0_q <= '0;
-    end else if (we_i) begin
-      aq_z0_q <= aq_z0_d;
-      bq_z0_q <= bq_z0_d;
-    end
-  end
+  prim_flop_en #(
+    .Width      ( 2*NPower ),
+    .ResetValue ( '0       )
+  ) u_prim_flop_abq_z0 (
+    .clk_i  ( clk_i              ),
+    .rst_ni ( rst_ni             ),
+    .en_i   ( we_i               ),
+    .d_i    ( {aq_z0_d, bq_z0_d} ),
+    .q_o    ( {aq_z0_q, bq_z0_q} )
+  );
 
   /////////////////////////
   // Optional Pipelining //
@@ -119,16 +120,17 @@
     // input data every clock cycle and prevents SCA leakage occurring due to the integration of
     // reshared cross-domain terms with inner-domain terms derived from different input data.
 
-    (* keep = "true" *) logic [NPower-1:0] mul_ax_ay_q, mul_bx_by_q;
-    always_ff @(posedge clk_i or negedge rst_ni) begin
-      if (!rst_ni) begin
-        mul_ax_ay_q <= '0;
-        mul_bx_by_q <= '0;
-      end else if (we_i) begin
-        mul_ax_ay_q <= mul_ax_ay_d;
-        mul_bx_by_q <= mul_bx_by_d;
-      end
-    end
+    logic [NPower-1:0] mul_ax_ay_q, mul_bx_by_q;
+    prim_flop_en #(
+      .Width      ( 2*NPower ),
+      .ResetValue ( '0       )
+    ) u_prim_flop_mul_abx_aby (
+      .clk_i  ( clk_i                      ),
+      .rst_ni ( rst_ni                     ),
+      .en_i   ( we_i                       ),
+      .d_i    ( {mul_ax_ay_d, mul_bx_by_d} ),
+      .q_o    ( {mul_ax_ay_q, mul_bx_by_q} )
+    );
 
     assign mul_ax_ay = mul_ax_ay_q;
     assign mul_bx_by = mul_bx_by_q;
@@ -139,8 +141,17 @@
     // this can cause SCA leakage as during the clock cycle in which new data arrives, the new
     // inner-domain terms are integrated with the previous, reshared cross-domain terms.
 
-    assign mul_ax_ay = mul_ax_ay_d;
-    assign mul_bx_by = mul_bx_by_d;
+    // Avoid aggressive synthesis optimizations.
+    logic [NPower-1:0] mul_ax_ay_buf, mul_bx_by_buf;
+    prim_buf #(
+      .Width  ( 2*NPower )
+    ) u_prim_buf_mul_abx_aby (
+      .in_i  ( {mul_ax_ay_d,   mul_bx_by_d}   ),
+      .out_o ( {mul_ax_ay_buf, mul_bx_by_buf} )
+    );
+
+    assign mul_ax_ay = mul_ax_ay_buf;
+    assign mul_bx_by = mul_bx_by_buf;
   end
 
   /////////////////
@@ -185,20 +196,21 @@
   //////////////
   // Blinding of y by z.
   logic [NPower-1:0] a_yz_d, b_yz_d;
-  (* keep = "true" *) logic [NPower-1:0] a_yz_q, b_yz_q;
+  logic [NPower-1:0] a_yz_q, b_yz_q;
   assign a_yz_d = a_y ^ a_z;
   assign b_yz_d = b_y ^ b_z;
 
   // Registers
-  always_ff @(posedge clk_i or negedge rst_ni) begin
-    if (!rst_ni) begin
-      a_yz_q <= '0;
-      b_yz_q <= '0;
-    end else if (we_i) begin
-      a_yz_q <= a_yz_d;
-      b_yz_q <= b_yz_d;
-    end
-  end
+  prim_flop_en #(
+    .Width      ( 2*NPower ),
+    .ResetValue ( '0       )
+  ) u_prim_flop_ab_yz (
+    .clk_i  ( clk_i            ),
+    .rst_ni ( rst_ni           ),
+    .en_i   ( we_i             ),
+    .d_i    ( {a_yz_d, b_yz_d} ),
+    .q_o    ( {a_yz_q, b_yz_q} )
+  );
 
   ////////////////
   // Correction //
@@ -230,16 +242,17 @@
     // and prevents SCA leakage occurring due to the multiplication of input x with b belonging to
     // different clock cycles.
 
-    (* keep = "true" *) logic [NPower-1:0] a_x_q, b_x_q;
-    always_ff @(posedge clk_i or negedge rst_ni) begin
-      if (!rst_ni) begin
-        a_x_q <= '0;
-        b_x_q <= '0;
-      end else if (we_i) begin
-        a_x_q <= a_x;
-        b_x_q <= b_x;
-      end
-    end
+    logic [NPower-1:0] a_x_q, b_x_q;
+    prim_flop_en #(
+      .Width      ( 2*NPower ),
+      .ResetValue ( '0       )
+    ) u_prim_flop_ab_x (
+      .clk_i  ( clk_i          ),
+      .rst_ni ( rst_ni         ),
+      .en_i   ( we_i           ),
+      .d_i    ( {a_x,   b_x}   ),
+      .q_o    ( {a_x_q, b_x_q} )
+    );
 
     assign a_x_calc = a_x_q;
     assign b_x_calc = b_x_q;
@@ -318,20 +331,21 @@
   //////////////
   // Blinding of y by z_0.
   logic [NPower-1:0] a_yz0_d, b_yz0_d;
-  (* keep = "true" *) logic [NPower-1:0] a_yz0_q, b_yz0_q;
+  logic [NPower-1:0] a_yz0_q, b_yz0_q;
   assign a_yz0_d = a_y ^ z_0;
   assign b_yz0_d = b_y ^ z_0;
 
   // Registers
-  always_ff @(posedge clk_i or negedge rst_ni) begin
-    if (!rst_ni) begin
-      a_yz0_q <= '0;
-      b_yz0_q <= '0;
-    end else if (we_i) begin
-      a_yz0_q <= a_yz0_d;
-      b_yz0_q <= b_yz0_d;
-    end
-  end
+  prim_flop_en #(
+    .Width      ( 2*NPower ),
+    .ResetValue ( '0       )
+  ) u_prim_flop_ab_yz0 (
+    .clk_i  ( clk_i              ),
+    .rst_ni ( rst_ni             ),
+    .en_i   ( we_i               ),
+    .d_i    ( {a_yz0_d, b_yz0_d} ),
+    .q_o    ( {a_yz0_q, b_yz0_q} )
+  );
 
   ////////////////
   // Correction //
@@ -342,7 +356,7 @@
   // which allows for further optimizations.
 
   // Calculation
-  (* keep = "true" *) logic [NPower-1:0] mul_ax_z0, mul_bx_z0;
+  logic [NPower-1:0] mul_ax_z0, mul_bx_z0;
   if (NPower == 4) begin : gen_corr_mul_gf2p4
     assign mul_ax_z0 = aes_mul_gf2p4(a_x, z_0);
     assign mul_bx_z0 = aes_mul_gf2p4(b_x, z_0);
@@ -352,22 +366,32 @@
     assign mul_bx_z0 = aes_mul_gf2p2(b_x, z_0);
   end
 
+  // Avoid aggressive synthesis optimizations.
+  logic [NPower-1:0] mul_ax_z0_buf, mul_bx_z0_buf;
+  prim_buf #(
+    .Width ( 2*NPower )
+  ) u_prim_buf_mul_abx_z0 (
+    .in_i  ( {mul_ax_z0,     mul_bx_z0}     ),
+    .out_o ( {mul_ax_z0_buf, mul_bx_z0_buf} )
+  );
+
   // Resharing
   logic [NPower-1:0] axz0_z1_d, bxz0_z1_d;
-  (* keep = "true" *) logic [NPower-1:0] axz0_z1_q, bxz0_z1_q;
-  assign axz0_z1_d = mul_ax_z0 ^ z_1;
-  assign bxz0_z1_d = mul_bx_z0 ^ z_1;
+  logic [NPower-1:0] axz0_z1_q, bxz0_z1_q;
+  assign axz0_z1_d = mul_ax_z0_buf ^ z_1;
+  assign bxz0_z1_d = mul_bx_z0_buf ^ z_1;
 
   // Registers
-  always_ff @(posedge clk_i or negedge rst_ni) begin
-    if (!rst_ni) begin
-      axz0_z1_q <= '0;
-      bxz0_z1_q <= '0;
-    end else if (we_i) begin
-      axz0_z1_q <= axz0_z1_d;
-      bxz0_z1_q <= bxz0_z1_d;
-    end
-  end
+  prim_flop_en #(
+    .Width      ( 2*NPower ),
+    .ResetValue ( '0       )
+  ) u_prim_flop_abxz0_z1 (
+    .clk_i  ( clk_i                  ),
+    .rst_ni ( rst_ni                 ),
+    .en_i   ( we_i                   ),
+    .d_i    ( {axz0_z1_d, bxz0_z1_d} ),
+    .q_o    ( {axz0_z1_q, bxz0_z1_q} )
+  );
 
   /////////////////////////
   // Optional Pipelining //
@@ -381,20 +405,17 @@
     //
     // The PreDomIndep variant has the required pipeline registers built in already.
 
-    (* keep = "true" *) logic [NPower-1:0] a_x_q, b_x_q, a_y_q, b_y_q;
-    always_ff @(posedge clk_i or negedge rst_ni) begin
-      if (!rst_ni) begin
-        a_x_q <= '0;
-        b_x_q <= '0;
-        a_y_q <= '0;
-        b_y_q <= '0;
-      end else if (we_i) begin
-        a_x_q <= a_x;
-        b_x_q <= b_x;
-        a_y_q <= a_y;
-        b_y_q <= b_y;
-      end
-    end
+    logic [NPower-1:0] a_x_q, b_x_q, a_y_q, b_y_q;
+    prim_flop_en #(
+      .Width      ( 4*NPower ),
+      .ResetValue ( '0       )
+    ) u_prim_flop_ab_xy (
+      .clk_i  ( clk_i                        ),
+      .rst_ni ( rst_ni                       ),
+      .en_i   ( we_i                         ),
+      .d_i    ( {a_x,   b_x,   a_y,   b_y}   ),
+      .q_o    ( {a_x_q, b_x_q, a_y_q, b_y_q} )
+    );
 
     assign a_x_calc = a_x_q;
     assign b_x_calc = b_x_q;
@@ -434,7 +455,7 @@
 
     // d_y part: Inner-domain terms of x * y
     logic [NPower-1:0] mul_ax_ay_d, mul_bx_by_d;
-    (* keep = "true" *) logic [NPower-1:0] mul_ax_ay_q, mul_bx_by_q;
+    logic [NPower-1:0] mul_ax_ay_q, mul_bx_by_q;
     if (NPower == 4) begin : gen_inner_mul_gf2p4
       assign mul_ax_ay_d = aes_mul_gf2p4(a_x_calc, a_y_calc);
       assign mul_bx_by_d = aes_mul_gf2p4(b_x_calc, b_y_calc);
@@ -445,31 +466,33 @@
     end
 
     // Registers
-    always_ff @(posedge clk_i or negedge rst_ni) begin
-      if (!rst_ni) begin
-        mul_ax_ay_q <= '0;
-        mul_bx_by_q <= '0;
-      end else if (we_i) begin
-        mul_ax_ay_q <= mul_ax_ay_d;
-        mul_bx_by_q <= mul_bx_by_d;
-      end
-    end
+    prim_flop_en #(
+      .Width      ( 2*NPower ),
+      .ResetValue ( '0       )
+    ) u_prim_flop_mul_abx_aby (
+      .clk_i  ( clk_i                      ),
+      .rst_ni ( rst_ni                     ),
+      .en_i   ( we_i                       ),
+      .d_i    ( {mul_ax_ay_d, mul_bx_by_d} ),
+      .q_o    ( {mul_ax_ay_q, mul_bx_by_q} )
+    );
 
     // Input Registers
-    (* keep = "true" *) logic [NPower-1:0] a_x_q, b_x_q;
-    always_ff @(posedge clk_i or negedge rst_ni) begin
-      if (!rst_ni) begin
-        a_x_q <= '0;
-        b_x_q <= '0;
-      end else if (we_i) begin
-        a_x_q <= a_x_calc;
-        b_x_q <= b_x_calc;
-      end
-    end
+    logic [NPower-1:0] a_x_q, b_x_q;
+    prim_flop_en #(
+      .Width      ( 2*NPower ),
+      .ResetValue ( '0       )
+    ) u_prim_flop_ab_xy (
+      .clk_i  ( clk_i                ),
+      .rst_ni ( rst_ni               ),
+      .en_i   ( we_i                 ),
+      .d_i    ( {a_x_calc, b_x_calc} ),
+      .q_o    ( {a_x_q,    b_x_q}    )
+    );
 
     // _D_y_z0 part: Cross-domain terms: d_x * _D_y_z0
     // Need to use registered version of input x.
-    (* keep = "true" *) logic [NPower-1:0] mul_ax_byz0, mul_bx_ayz0;
+    logic [NPower-1:0] mul_ax_byz0, mul_bx_ayz0;
     if (NPower == 4) begin : gen_cross_mul_gf2p4
       assign mul_ax_byz0 = aes_mul_gf2p4(a_x_q, b_yz0_q);
       assign mul_bx_ayz0 = aes_mul_gf2p4(b_x_q, a_yz0_q);
@@ -479,9 +502,18 @@
       assign mul_bx_ayz0 = aes_mul_gf2p2(b_x_q, a_yz0_q);
     end
 
+    // Avoid aggressive synthesis optimizations.
+    logic [NPower-1:0] mul_ax_byz0_buf, mul_bx_ayz0_buf;
+    prim_buf #(
+      .Width ( 2*NPower )
+    ) u_prim_buf_mul_abx_bayz0 (
+      .in_i  ( {mul_ax_byz0,     mul_bx_ayz0}     ),
+      .out_o ( {mul_ax_byz0_buf, mul_bx_ayz0_buf} )
+    );
+
     // Integration
-    assign a_q = axz0_z1_q ^ mul_ax_ay_q ^ mul_ax_byz0;
-    assign b_q = bxz0_z1_q ^ mul_bx_by_q ^ mul_bx_ayz0;
+    assign a_q = axz0_z1_q ^ mul_ax_ay_q ^ mul_ax_byz0_buf;
+    assign b_q = bxz0_z1_q ^ mul_bx_by_q ^ mul_bx_ayz0_buf;
 
   end else begin : gen_not_pre_dom_indep
     // This DOM-dep multiplier is not directly followed by an un-pipelined DOM-indep multiplier. As
@@ -489,23 +521,41 @@
     // with input x which allows saving 2 GF multipliers.
 
     // Sum up d_y and _D_y_z0.
-    (* keep = "true" *) logic [NPower-1:0] a_b, b_b;
+    logic [NPower-1:0] a_b, b_b;
     assign a_b = a_y_calc ^ b_yz0_q;
     assign b_b = b_y_calc ^ a_yz0_q;
 
+    // Avoid aggressive synthesis optimizations.
+    logic [NPower-1:0] a_b_buf, b_b_buf;
+    prim_buf #(
+      .Width ( 2*NPower )
+    ) u_prim_buf_ab_b (
+      .in_i  ( {a_b,     b_b}     ),
+      .out_o ( {a_b_buf, b_b_buf} )
+    );
+
     // GF multiplications
-    (* keep = "true" *) logic [NPower-1:0] a_mul_ax_b, b_mul_bx_b;
+    logic [NPower-1:0] a_mul_ax_b, b_mul_bx_b;
     if (NPower == 4) begin : gen_mul_gf2p4
-      assign a_mul_ax_b = aes_mul_gf2p4(a_x_calc, a_b);
-      assign b_mul_bx_b = aes_mul_gf2p4(b_x_calc, b_b);
+      assign a_mul_ax_b = aes_mul_gf2p4(a_x_calc, a_b_buf);
+      assign b_mul_bx_b = aes_mul_gf2p4(b_x_calc, b_b_buf);
     end else begin : gen_mul_gf2p2
-      assign a_mul_ax_b = aes_mul_gf2p2(a_x_calc, a_b);
-      assign b_mul_bx_b = aes_mul_gf2p2(b_x_calc, b_b);
+      assign a_mul_ax_b = aes_mul_gf2p2(a_x_calc, a_b_buf);
+      assign b_mul_bx_b = aes_mul_gf2p2(b_x_calc, b_b_buf);
     end
 
+    // Avoid aggressive synthesis optimizations.
+    logic [NPower-1:0] a_mul_ax_b_buf, b_mul_bx_b_buf;
+    prim_buf #(
+      .Width ( 2*NPower )
+    ) u_prim_buf_ab_mul_abx_b (
+      .in_i  ( {a_mul_ax_b,     b_mul_bx_b}     ),
+      .out_o ( {a_mul_ax_b_buf, b_mul_bx_b_buf} )
+    );
+
     // Integration
-    assign a_q = axz0_z1_q ^ a_mul_ax_b;
-    assign b_q = bxz0_z1_q ^ b_mul_bx_b;
+    assign a_q = axz0_z1_q ^ a_mul_ax_b_buf;
+    assign b_q = bxz0_z1_q ^ b_mul_bx_b_buf;
   end
 
   // Only GF(2^4) and GF(2^2) is supported.
@@ -541,18 +591,19 @@
   assign b_gamma0 = b_gamma[1:0];
 
   logic [1:0] a_gamma_ss_d, b_gamma_ss_d;
-  (* keep = "true" *) logic [1:0] a_gamma_ss_q, b_gamma_ss_q;
+  logic [1:0] a_gamma_ss_q, b_gamma_ss_q;
   assign a_gamma_ss_d = aes_scale_omega2_gf2p2(aes_square_gf2p2(a_gamma1 ^ a_gamma0));
   assign b_gamma_ss_d = aes_scale_omega2_gf2p2(aes_square_gf2p2(b_gamma1 ^ b_gamma0));
-  always_ff @(posedge clk_i or negedge rst_ni) begin
-    if (!rst_ni) begin
-      a_gamma_ss_q <= '0;
-      b_gamma_ss_q <= '0;
-    end else if (we_i[0]) begin
-      a_gamma_ss_q <= a_gamma_ss_d;
-      b_gamma_ss_q <= b_gamma_ss_d;
-    end
-  end
+  prim_flop_en #(
+    .Width      ( 4  ),
+    .ResetValue ( '0 )
+  ) u_prim_flop_ab_gamma_ss (
+    .clk_i  ( clk_i                        ),
+    .rst_ni ( rst_ni                       ),
+    .en_i   ( we_i[0]                      ),
+    .d_i    ( {a_gamma_ss_d, b_gamma_ss_d} ),
+    .q_o    ( {a_gamma_ss_q, b_gamma_ss_q} )
+  );
 
   aes_dom_dep_mul_gf2pn #(
     .NPower      ( 2    ),
@@ -577,26 +628,32 @@
   /////////////
 
   // Formulas 14 and 15 in [2].
-  (* keep = "true" *) logic [1:0] a_omega, b_omega;
+  logic [1:0] a_omega, b_omega;
   assign a_omega = aes_square_gf2p2(a_gamma1_gamma0 ^ a_gamma_ss_q);
   assign b_omega = aes_square_gf2p2(b_gamma1_gamma0 ^ b_gamma_ss_q);
 
+  // Avoid aggressive synthesis optimizations.
+  logic [1:0] a_omega_buf, b_omega_buf;
+  prim_buf #(
+    .Width ( 4 )
+  ) u_prim_buf_ab_omega (
+    .in_i  ( {a_omega,     b_omega}     ),
+    .out_o ( {a_omega_buf, b_omega_buf} )
+  );
+
   // Formulas 16 and 17 in [2].
 
-  (* keep = "true" *) logic [1:0] a_gamma1_q, a_gamma0_q, b_gamma1_q, b_gamma0_q;
-  always_ff @(posedge clk_i or negedge rst_ni) begin
-    if (!rst_ni) begin
-      a_gamma1_q <= '0;
-      a_gamma0_q <= '0;
-      b_gamma1_q <= '0;
-      b_gamma0_q <= '0;
-    end else if (we_i[0]) begin
-      a_gamma1_q <= a_gamma1;
-      a_gamma0_q <= a_gamma0;
-      b_gamma1_q <= b_gamma1;
-      b_gamma0_q <= b_gamma0;
-    end
-  end
+  logic [1:0] a_gamma1_q, a_gamma0_q, b_gamma1_q, b_gamma0_q;
+  prim_flop_en #(
+    .Width      ( 8  ),
+    .ResetValue ( '0 )
+  ) u_prim_flop_ab_gamma10 (
+    .clk_i  ( clk_i                                            ),
+    .rst_ni ( rst_ni                                           ),
+    .en_i   ( we_i[0]                                          ),
+    .d_i    ( {a_gamma1,   a_gamma0,   b_gamma1,   b_gamma0}   ),
+    .q_o    ( {a_gamma1_q, a_gamma0_q, b_gamma1_q, b_gamma0_q} )
+  );
 
   aes_dom_dep_mul_gf2pn #(
     .NPower      ( 2    ),
@@ -607,9 +664,9 @@
     .rst_ni ( rst_ni           ),
     .we_i   ( we_i[1]          ),
     .a_x    ( a_gamma1_q       ), // Share a of x
-    .a_y    ( a_omega          ), // Share a of y
+    .a_y    ( a_omega_buf      ), // Share a of y
     .b_x    ( b_gamma1_q       ), // Share b of x
-    .b_y    ( b_omega          ), // Share b of y
+    .b_y    ( b_omega_buf      ), // Share b of y
     .z_0    ( prd_3[5:4]       ), // Randomness for blinding
     .z_1    ( prd_3[7:6]       ), // Randomness for resharing
     .a_q    ( a_gamma_inv[1:0] ), // Share a of q
@@ -624,9 +681,9 @@
     .clk_i  ( clk_i            ),
     .rst_ni ( rst_ni           ),
     .we_i   ( we_i[1]          ),
-    .a_x    ( a_omega          ), // Share a of x
+    .a_x    ( a_omega_buf      ), // Share a of x
     .a_y    ( a_gamma0_q       ), // Share a of y
-    .b_x    ( b_omega          ), // Share b of x
+    .b_x    ( b_omega_buf      ), // Share b of x
     .b_y    ( b_gamma0_q       ), // Share b of y
     .z_0    ( prd_3[1:0]       ), // Randomness for blinding
     .z_1    ( prd_3[3:2]       ), // Randomness for resharing
@@ -657,25 +714,25 @@
   // Formula 12 in [2].
 
   logic [3:0] a_y1, a_y0, b_y1, b_y0, a_y1_y0, b_y1_y0;
-  (* keep = "true" *) logic [3:0] a_gamma, b_gamma;
   assign a_y1 = a_y[7:4];
   assign a_y0 = a_y[3:0];
   assign b_y1 = b_y[7:4];
   assign b_y0 = b_y[3:0];
 
   logic [3:0] a_y_ss_d, b_y_ss_d;
-  (* keep = "true" *) logic [3:0] a_y_ss_q, b_y_ss_q;
+  logic [3:0] a_y_ss_q, b_y_ss_q;
   assign a_y_ss_d = aes_square_scale_gf2p4_gf2p2(a_y1 ^ a_y0);
   assign b_y_ss_d = aes_square_scale_gf2p4_gf2p2(b_y1 ^ b_y0);
-  always_ff @(posedge clk_i or negedge rst_ni) begin
-    if (!rst_ni) begin
-      a_y_ss_q <= '0;
-      b_y_ss_q <= '0;
-    end else if (we_i[0]) begin
-      a_y_ss_q <= a_y_ss_d;
-      b_y_ss_q <= b_y_ss_d;
-    end
-  end
+  prim_flop_en #(
+    .Width      ( 8  ),
+    .ResetValue ( '0 )
+  ) u_prim_flop_ab_y_ss (
+    .clk_i  ( clk_i                ),
+    .rst_ni ( rst_ni               ),
+    .en_i   ( we_i[0]              ),
+    .d_i    ( {a_y_ss_d, b_y_ss_d} ),
+    .q_o    ( {a_y_ss_q, b_y_ss_q} )
+  );
 
   aes_dom_dep_mul_gf2pn #(
     .NPower      ( 4    ),
@@ -695,9 +752,19 @@
     .b_q    ( b_y1_y0        )  // Share b of q
   );
 
+  logic [3:0] a_gamma, b_gamma;
   assign a_gamma = a_y_ss_q ^ a_y1_y0;
   assign b_gamma = b_y_ss_q ^ b_y1_y0;
 
+  // Avoid aggressive synthesis optimizations.
+  logic [3:0] a_gamma_buf, b_gamma_buf;
+  prim_buf #(
+    .Width ( 8 )
+  ) u_prim_buf_ab_gamma (
+    .in_i  ( {a_gamma,     b_gamma}     ),
+    .out_o ( {a_gamma_buf, b_gamma_buf} )
+  );
+
   ////////////////////
   // Stages 2 and 3 //
   ////////////////////
@@ -706,15 +773,15 @@
 
   // a_gamma is masked by b_gamma, a_gamma_inv is masked by b_gamma_inv.
   aes_dom_inverse_gf2p4 u_aes_dom_inverse_gf2p4 (
-    .clk_i       ( clk_i     ),
-    .rst_ni      ( rst_ni    ),
-    .we_i        ( we_i[2:1] ),
-    .a_gamma     ( a_gamma   ),
-    .b_gamma     ( b_gamma   ),
-    .prd_2       ( prd.prd_2 ),
-    .prd_3       ( prd.prd_3 ),
-    .a_gamma_inv ( a_theta   ),
-    .b_gamma_inv ( b_theta   )
+    .clk_i       ( clk_i       ),
+    .rst_ni      ( rst_ni      ),
+    .we_i        ( we_i[2:1]   ),
+    .a_gamma     ( a_gamma_buf ),
+    .b_gamma     ( b_gamma_buf ),
+    .prd_2       ( prd.prd_2   ),
+    .prd_3       ( prd.prd_3   ),
+    .a_gamma_inv ( a_theta     ),
+    .b_gamma_inv ( b_theta     )
   );
 
   /////////////
@@ -722,20 +789,17 @@
   /////////////
   // Formulas 18 and 19 in [2].
 
-  (* keep = "true" *) logic [3:0] a_y1_q, a_y0_q, b_y1_q, b_y0_q;
-  always_ff @(posedge clk_i or negedge rst_ni) begin
-    if (!rst_ni) begin
-      a_y1_q <= '0;
-      a_y0_q <= '0;
-      b_y1_q <= '0;
-      b_y0_q <= '0;
-    end else if (we_i[2]) begin
-      a_y1_q <= a_y1;
-      a_y0_q <= a_y0;
-      b_y1_q <= b_y1;
-      b_y0_q <= b_y0;
-    end
-  end
+  logic [3:0] a_y1_q, a_y0_q, b_y1_q, b_y0_q;
+  prim_flop_en #(
+    .Width      ( 16 ),
+    .ResetValue ( '0 )
+  ) u_prim_flop_ab_y10 (
+    .clk_i  ( clk_i                            ),
+    .rst_ni ( rst_ni                           ),
+    .en_i   ( we_i[2]                          ),
+    .d_i    ( {a_y1,   a_y0,   b_y1,   b_y0}   ),
+    .q_o    ( {a_y1_q, a_y0_q, b_y1_q, b_y0_q} )
+  );
 
   aes_dom_indep_mul_gf2pn #(
     .NPower   ( 4    ),