[aes] Add DOM S-Box

This commit adds an S-Box implementation that uses domain-oriented masking.

Signed-off-by: Pirmin Vogel <vogelpi@lowrisc.org>
diff --git a/hw/ip/aes/aes.core b/hw/ip/aes/aes.core
index e12c8ef..437aee3 100644
--- a/hw/ip/aes/aes.core
+++ b/hw/ip/aes/aes.core
@@ -28,6 +28,7 @@
       - rtl/aes_sbox_canright.sv
       - rtl/aes_sbox_canright_masked_noreuse.sv
       - rtl/aes_sbox_canright_masked.sv
+      - rtl/aes_sbox_dom.sv
       - rtl/aes_shift_rows.sv
       - rtl/aes_mix_columns.sv
       - rtl/aes_mix_single_column.sv
diff --git a/hw/ip/aes/lint/aes.vlt b/hw/ip/aes/lint/aes.vlt
index 004c2ba..660f05e 100644
--- a/hw/ip/aes/lint/aes.vlt
+++ b/hw/ip/aes/lint/aes.vlt
@@ -12,3 +12,4 @@
 
 // Masked SBox implementations may require multiple modules to prevent aggressive synthesis optimizations.
 lint_off -rule DECLFILENAME -file "*/rtl/aes_sbox_*_masked*.sv" -match "Filename 'aes_sbox_*_masked*' does not match MODULE name: *"
+lint_off -rule DECLFILENAME -file "*/rtl/aes_sbox_dom*.sv" -match "Filename 'aes_sbox_dom*' does not match MODULE name: *"
diff --git a/hw/ip/aes/pre_dv/aes_sbox_lec/aes_sbox_lec.py b/hw/ip/aes/pre_dv/aes_sbox_lec/aes_sbox_lec.py
index be99494..016ab50 100755
--- a/hw/ip/aes/pre_dv/aes_sbox_lec/aes_sbox_lec.py
+++ b/hw/ip/aes/pre_dv/aes_sbox_lec/aes_sbox_lec.py
@@ -33,6 +33,9 @@
 impl_list = [
     impl_dut.replace(rtl_path, '').replace('.sv', '') for impl_dut in impl_list
 ]
+# Remove multicycle implementations; we can't perform LEC for those.
+impl_list.remove('aes_sbox_dom')
+# Remove reference implementation and package files.
 impl_list.remove(impl_gold)
 impl_list.remove(file_pkg_canright)
 file_pkg_canright = file_pkg_canright + '.sv'
diff --git a/hw/ip/aes/pre_dv/aes_sbox_tb/rtl/aes_sbox_tb.sv b/hw/ip/aes/pre_dv/aes_sbox_tb/rtl/aes_sbox_tb.sv
index b996cf6..5ab986a 100644
--- a/hw/ip/aes/pre_dv/aes_sbox_tb/rtl/aes_sbox_tb.sv
+++ b/hw/ip/aes/pre_dv/aes_sbox_tb/rtl/aes_sbox_tb.sv
@@ -20,7 +20,7 @@
   ciph_op_e   op;
 
   localparam int NUM_SBOX_IMPLS = 2;
-  localparam int NUM_SBOX_IMPLS_MASKED = 2;
+  localparam int NUM_SBOX_IMPLS_MASKED = 3;
   localparam int NumSBoxImplsTotal = NUM_SBOX_IMPLS + NUM_SBOX_IMPLS_MASKED;
   logic [7:0] responses[NumSBoxImplsTotal];
 
@@ -29,7 +29,7 @@
   always_ff @(posedge clk_i or negedge rst_ni) begin : reg_count
     if (!rst_ni) begin
       count_q <= '0;
-    end else begin
+    end else if (dom_done) begin
       count_q <= count_d;
     end
   end
@@ -53,27 +53,28 @@
   // Mask Generation
   parameter int unsigned WidthPRDSBoxCanrightMasked        = 8;
   parameter int unsigned WidthPRDSBoxCanrightMaskedNoreuse = 18;
+  parameter int unsigned WidthPRDSBoxDOM                   = 28;
 
-  logic              [7:0] masked_stimulus;
-  logic              [7:0] in_mask;
+  logic                      [7:0] masked_stimulus;
+  logic                      [7:0] in_mask;
 
-  logic              [7:0] masked_response [NUM_SBOX_IMPLS_MASKED];
-  logic              [7:0] out_mask [NUM_SBOX_IMPLS_MASKED];
+  logic                      [7:0] masked_response [NUM_SBOX_IMPLS_MASKED];
+  logic                      [7:0] out_mask [NUM_SBOX_IMPLS_MASKED];
 
-  logic                                       [31:0] tmp;
-  logic [31-(WidthPRDSBoxCanrightMaskedNoreuse+8):0] unused_tmp;
-  logic      [WidthPRDSBoxCanrightMaskedNoreuse-1:0] prd_masking;
+  logic                     [63:0] tmp;
+  logic [63-(WidthPRDSBoxDOM+8):0] unused_tmp;
+  logic      [WidthPRDSBoxDOM-1:0] prd_masking;
 
   always_ff @(posedge clk_i or negedge rst_ni) begin : reg_tmp
     if (!rst_ni) begin
-      tmp <= 32'hAAAFF;
-    end else begin
-      tmp <= $random;
+      tmp <= 64'hAAAFF;
+    end else if (dom_done) begin
+      tmp <= {$random, $random};
     end
   end
   assign in_mask     = tmp[7:0];
-  assign prd_masking = tmp[8 +: WidthPRDSBoxCanrightMaskedNoreuse];
-  assign unused_tmp  = tmp[31:WidthPRDSBoxCanrightMaskedNoreuse+8];
+  assign prd_masking = tmp[8 +: WidthPRDSBoxDOM];
+  assign unused_tmp  = tmp[63:WidthPRDSBoxDOM+8];
 
   assign masked_stimulus = stimulus ^ in_mask;
 
@@ -96,6 +97,22 @@
     .mask_o ( out_mask[1]                                 )
   );
 
+  // Instantiate DOM SBox Implementation
+  logic dom_done;
+  aes_sbox_dom aes_sbox_dom (
+    .clk_i     ( clk_i                            ),
+    .rst_ni    ( rst_ni                           ),
+    .en_i      ( 1'b1                             ),
+    .out_req_o ( dom_done                         ),
+    .out_ack_i ( 1'b1                             ),
+    .op_i      ( op                               ),
+    .data_i    ( masked_stimulus                  ),
+    .mask_i    ( in_mask                          ),
+    .prd_i     ( prd_masking[WidthPRDSBoxDOM-1:0] ),
+    .data_o    ( masked_response[2]               ),
+    .mask_o    ( out_mask[2]                      )
+  );
+
   // Unmask responses
   always_comb begin : unmask_resp
     for (int i=0; i<NUM_SBOX_IMPLS_MASKED; i++) begin
@@ -109,7 +126,7 @@
     test_passed_o <= 1'b1;
 
     for (int i=1; i<NumSBoxImplsTotal; i++) begin
-      if (rst_ni && (responses[i] != responses[0])) begin
+      if (rst_ni && dom_done && (responses[i] != responses[0])) begin
         $display("\nERROR: Mismatch between LUT-based S-Box and Implementation %0d found.", i);
         $display("op = %s, stimulus = 8'h%h, expected resp = 8'h%h, actual resp = 8'h%h\n",
             (op == CIPH_FWD) ? "CIPH_FWD" : "CIPH_INV", stimulus, responses[0], responses[i]);
diff --git a/hw/ip/aes/rtl/aes_cipher_core.sv b/hw/ip/aes/rtl/aes_cipher_core.sv
index 00c4050..3711140 100644
--- a/hw/ip/aes/rtl/aes_cipher_core.sv
+++ b/hw/ip/aes/rtl/aes_cipher_core.sv
@@ -636,7 +636,8 @@
   `ASSERT_INIT(AesMaskedCoreAndSBox,
       (Masking &&
       (SBoxImpl == SBoxImplCanrightMasked ||
-       SBoxImpl == SBoxImplCanrightMaskedNoreuse)) ||
+       SBoxImpl == SBoxImplCanrightMaskedNoreuse ||
+       SBoxImpl == SBoxImplDom)) ||
       (!Masking &&
       (SBoxImpl == SBoxImplLut ||
        SBoxImpl == SBoxImplCanright)))
diff --git a/hw/ip/aes/rtl/aes_key_expand.sv b/hw/ip/aes/rtl/aes_key_expand.sv
index b326e03..7935aa0 100644
--- a/hw/ip/aes/rtl/aes_key_expand.sv
+++ b/hw/ip/aes/rtl/aes_key_expand.sv
@@ -388,7 +388,8 @@
   `ASSERT_INIT(AesMaskedCoreAndSBox,
       (Masking &&
       (SBoxImpl == SBoxImplCanrightMasked ||
-       SBoxImpl == SBoxImplCanrightMaskedNoreuse)) ||
+       SBoxImpl == SBoxImplCanrightMaskedNoreuse ||
+       SBoxImpl == SBoxImplDom)) ||
       (!Masking &&
       (SBoxImpl == SBoxImplLut ||
        SBoxImpl == SBoxImplCanright)))
diff --git a/hw/ip/aes/rtl/aes_pkg.sv b/hw/ip/aes/rtl/aes_pkg.sv
index d1c743a..b4e5a79 100644
--- a/hw/ip/aes/rtl/aes_pkg.sv
+++ b/hw/ip/aes/rtl/aes_pkg.sv
@@ -49,12 +49,14 @@
     216'h6587da04c59c02125750f35e7634e08951122874022ce19b143211;
 
 typedef enum integer {
-  SBoxImplLut,                  // Unmasked LUT-based S-Box
-  SBoxImplCanright,             // Unmasked Canright S-Box, see aes_sbox_canright.sv
-  SBoxImplCanrightMasked,       // First-order masked Canright S-Box
-                                // see aes_sbox_canright_masked.sv
-  SBoxImplCanrightMaskedNoreuse // First-order masked Canright S-Box without mask reuse,
-                                // see aes_sbox_canright_masked_noreuse.sv
+  SBoxImplLut,                   // Unmasked LUT-based S-Box
+  SBoxImplCanright,              // Unmasked Canright S-Box, see aes_sbox_canright.sv
+  SBoxImplCanrightMasked,        // First-order masked Canright S-Box
+                                 // see aes_sbox_canright_masked.sv
+  SBoxImplCanrightMaskedNoreuse, // First-order masked Canright S-Box without mask reuse,
+                                 // see aes_sbox_canright_masked_noreuse.sv
+  SBoxImplDom                    // First-order masked S-Box using domain-oriented masking,
+                                 // see aes_sbox_dom.sv
 } sbox_impl_e;
 
 typedef enum logic {
diff --git a/hw/ip/aes/rtl/aes_sbox.sv b/hw/ip/aes/rtl/aes_sbox.sv
index b3dc17b..ac0e2cd 100644
--- a/hw/ip/aes/rtl/aes_sbox.sv
+++ b/hw/ip/aes/rtl/aes_sbox.sv
@@ -23,9 +23,10 @@
 
   import aes_pkg::*;
   localparam bit SBoxMasked = (SBoxImpl == SBoxImplCanrightMasked ||
-                               SBoxImpl == SBoxImplCanrightMaskedNoreuse) ? 1'b1 : 1'b0;
+                               SBoxImpl == SBoxImplCanrightMaskedNoreuse ||
+                               SBoxImpl == SBoxImplDom) ? 1'b1 : 1'b0;
 
-  localparam bit SBoxSingleCycle = 1'b1;
+  localparam bit SBoxSingleCycle = (SBoxImpl == SBoxImplDom) ? 1'b0 : 1'b1;
 
   if (!SBoxMasked) begin : gen_sbox_unmasked
     // Tie off unused inputs.
@@ -57,7 +58,22 @@
 
   end else begin : gen_sbox_masked
 
-    if (SBoxImpl == SBoxImplCanrightMaskedNoreuse) begin : gen_sbox_canright_masked_noreuse
+    if (SBoxImpl == SBoxImplDom) begin : gen_sbox_dom
+      aes_sbox_dom u_aes_sbox (
+        .clk_i      ( clk_i       ),
+        .rst_ni     ( rst_ni      ),
+        .en_i       ( en_i        ),
+        .out_req_o  ( out_req_o   ),
+        .out_ack_i  ( out_ack_i   ),
+        .op_i       ( op_i        ),
+        .data_i     ( data_i      ),
+        .mask_i     ( mask_i      ),
+        .prd_i      ( prd_i[27:0] ),
+        .data_o     ( data_o      ),
+        .mask_o     ( mask_o      )
+      );
+
+    end else if (SBoxImpl == SBoxImplCanrightMaskedNoreuse) begin : gen_sbox_canright_masked_noreuse
       // Tie off unused inputs.
       logic unused_clk;
       logic unused_rst;
@@ -107,25 +123,6 @@
 
     // Signal that we have valid output right away.
     assign out_req_o = en_i;
-  end else begin : gen_req_multicycle
-
-    // All currently implemented S-Boxes allow for single-cycle operation. Future S-Box
-    // implementations may require multiple clock cycles. The counter below is for mimicking such
-    // implementations. It's for testing purposes only.
-
-    // Counter register
-    logic [2:0] count_d, count_q;
-    assign count_d = (out_req_o && out_ack_i) ? '0                :
-                     out_req_o                ? count_q           :
-                     en_i                     ? count_q + 3'b001 : count_q;
-    always_ff @(posedge clk_i or negedge rst_ni) begin : reg_count
-      if (!rst_ni) begin
-        count_q <= '0;
-      end else begin
-        count_q <= count_d;
-      end
-    end
-    assign out_req_o = en_i & count_q == 3'b111;
   end
 
 endmodule
diff --git a/hw/ip/aes/rtl/aes_sbox_dom.sv b/hw/ip/aes/rtl/aes_sbox_dom.sv
new file mode 100644
index 0000000..2f4fdec
--- /dev/null
+++ b/hw/ip/aes/rtl/aes_sbox_dom.sv
@@ -0,0 +1,657 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// AES S-Box with First-Order Domain-Oriented Masking
+//
+// This is the unpipelined version using DOM-dep multipliers. It has a latency of 5 clock cycles
+// and requires 28 bits of pseudo-random data per evaluation. Pipelining would only be beneficial
+// when using
+// - either a cipher core architecture with a data path smaller than 128 bit, i.e., where the
+//   individual S-Boxes are evaluated more than once per round, or
+// - a fully unrolled cipher core architecture for maximum throughput.
+//
+// Note: The DOM AES S-Box is built on top of the Canright masked S-Box without mask re-use.
+//
+// For details, see the following papers and reports:
+// [1] Gross, "Domain-Oriented Masking: Compact Masked Hardware Implementations with Arbitrary
+//     Protection Order" available at https://eprint.iacr.org/2016/486.pdf
+// [2] Canright, "A very compact 'perfectly masked' S-box for AES (corrected)" available at
+//     https://eprint.iacr.org/2009/011.pdf
+// [3] Canright, "A very compact Rijndael S-box" available at https://hdl.handle.net/10945/25608
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// IMPORTANT NOTE:                                                                               //
+//                            DO NOT USE THIS FOR SYNTHESIS BLINDLY!                             //
+//                                                                                               //
+// This implementation targets primarily Xilinx Vivado synthesis as well as RTL simulation. It   //
+// contains synthesis attributes specific to Xilinx Vivado to prevent the synthesis tool from    //
+// optimizing away registers and to enforce the correct ordering of operations. Other synthesis  //
+// tools might still heavily optimize the design. The result is likely insecure. Use with care.  //
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+`include "prim_assert.sv"
+
+// DOM-indep GF(2^N) multiplier, unpipelined, first-order masked.
+// Computes (a_q ^ b_q) = (a_x ^ b_x) * (a_y ^ b_y), i.e. q = x * y using first-order
+// domain-oriented masking. The sharings of x and y are required to be uniformly random and
+// independent from each other.
+// See Fig. 2 in [1].
+module aes_dom_indep_mul_gf2pn #(
+  parameter int unsigned NPower = 4
+) (
+  input  logic              clk_i,
+  input  logic              rst_ni,
+  input  logic [NPower-1:0] a_x,    // Share a of x
+  input  logic [NPower-1:0] a_y,    // Share a of y
+  input  logic [NPower-1:0] b_x,    // Share b of x
+  input  logic [NPower-1:0] b_y,    // Share b of y
+  input  logic [NPower-1:0] z_0,    // Randomness for resharing
+  output logic [NPower-1:0] a_q,    // Share a of q
+  output logic [NPower-1:0] b_q     // Share b of q
+);
+
+  import aes_sbox_canright_pkg::*;
+
+  /////////////////
+  // Calculation //
+  /////////////////
+  // Inner-domain terms
+  (* keep = "true" *) logic [NPower-1:0] mul_ax_ay, mul_bx_by;
+  if (NPower == 4) begin : gen_inner_mul_gf2p4
+    assign mul_ax_ay = aes_mul_gf2p4(a_x, a_y);
+    assign mul_bx_by = aes_mul_gf2p4(b_x, b_y);
+
+  end else begin : gen_inner_mul_gf2p2
+    assign mul_ax_ay = aes_mul_gf2p2(a_x, a_y);
+    assign mul_bx_by = aes_mul_gf2p2(b_x, b_y);
+  end
+
+  // Cross-domain terms
+  logic [NPower-1:0] mul_ax_by, mul_ay_bx;
+  if (NPower == 4) begin : gen_cross_mul_gf2p4
+    assign mul_ax_by = aes_mul_gf2p4(a_x, b_y);
+    assign mul_ay_bx = aes_mul_gf2p4(a_y, b_x);
+
+  end else begin : gen_cross_mul_gf2p2
+    assign mul_ax_by = aes_mul_gf2p2(a_x, b_y);
+    assign mul_ay_bx = aes_mul_gf2p2(a_y, b_x);
+  end
+
+  ///////////////
+  // Resharing //
+  ///////////////
+  // Resharing of cross-domain terms
+  (* keep = "true" *) logic [NPower-1:0] aq_z0_d, aq_z0_q;
+  (* keep = "true" *) logic [NPower-1:0] bq_z0_d, bq_z0_q;
+  assign aq_z0_d = z_0 ^ mul_ax_by;
+  assign bq_z0_d = z_0 ^ mul_ay_bx;
+
+  // Registers
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      aq_z0_q <= '0;
+      bq_z0_q <= '0;
+    end else begin
+      aq_z0_q <= aq_z0_d;
+      bq_z0_q <= bq_z0_d;
+    end
+  end
+
+  /////////////////
+  // Integration //
+  /////////////////
+  assign a_q = mul_ax_ay ^ aq_z0_q;
+  assign b_q = mul_bx_by ^ bq_z0_q;
+
+  // Only GF(2^4) and GF(2^2) are supported.
+  `ASSERT_INIT(AesDomIndepMulPower, NPower == 4 || NPower == 2)
+
+endmodule
+
+// DOM-dep GF(2^N) multiplier, unpipelined, first-order masked.
+// Computes (a_q ^ b_q) = (a_x ^ b_x) * (a_y ^ b_y), i.e. q = x * y using first-order
+// domain-oriented masking. The sharings of x and y are NOT required to be independent from each
+// other. This is the un-optimized version consuming 3 times N bits of randomness for blinding and
+// resharing. It is not used in the design but we keep it for reference.
+// See Fig. 4 and Formulas 8 - 11 in [1].
+module aes_dom_dep_mul_gf2pn_unopt #(
+  parameter int unsigned NPower = 4
+) (
+  input  logic              clk_i,
+  input  logic              rst_ni,
+  input  logic [NPower-1:0] a_x,    // Share a of x
+  input  logic [NPower-1:0] a_y,    // Share a of y
+  input  logic [NPower-1:0] b_x,    // Share b of x
+  input  logic [NPower-1:0] b_y,    // Share b of y
+  input  logic [NPower-1:0] a_z,    // Randomness for blinding
+  input  logic [NPower-1:0] b_z,    // Randomness for blinding
+  input  logic [NPower-1:0] z_0,    // Randomness for resharing
+  output logic [NPower-1:0] a_q,    // Share a of q
+  output logic [NPower-1:0] b_q     // Share b of q
+);
+
+  import aes_sbox_canright_pkg::*;
+
+  //////////////
+  // Blinding //
+  //////////////
+  // Blinding of y by z.
+  (* keep = "true" *) logic [NPower-1:0] a_yz_d, a_yz_q;
+  (* keep = "true" *) logic [NPower-1:0] b_yz_d, b_yz_q;
+  assign a_yz_d = a_y ^ a_z;
+  assign b_yz_d = b_y ^ b_z;
+
+  // Registers
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      a_yz_q <= '0;
+      b_yz_q <= '0;
+    end else begin
+      a_yz_q <= a_yz_d;
+      b_yz_q <= b_yz_d;
+    end
+  end
+
+  ////////////////
+  // Correction //
+  ////////////////
+  logic [NPower-1:0] a_mul_x_z, b_mul_x_z;
+  aes_dom_indep_mul_gf2pn #(
+    .NPower ( NPower )
+  ) aes_dom_indep_mul_gf2pn (
+    .clk_i  ( clk_i     ),
+    .rst_ni ( rst_ni    ),
+    .a_x    ( a_x       ), // Share a of x
+    .a_y    ( a_z       ), // Share a of z
+    .b_x    ( b_x       ), // Share b of x
+    .b_y    ( b_z       ), // Share b of z
+    .z_0    ( z_0       ), // Randomness for resharing
+    .a_q    ( a_mul_x_z ), // Share a of x * z
+    .b_q    ( b_mul_x_z )  // Share b of x * z
+  );
+
+  /////////////////
+  // Calculation //
+  /////////////////
+  // Combine shares of blinded y to obtain b.
+  logic [NPower-1:0] b;
+  assign b = a_yz_q ^ b_yz_q;
+
+  logic [NPower-1:0] a_mul_ax_b, b_mul_bx_b;
+  if (NPower == 4) begin : gen_mul_gf2p4
+    assign a_mul_ax_b = aes_mul_gf2p4(a_x, b);
+    assign b_mul_bx_b = aes_mul_gf2p4(b_x, b);
+
+  end else begin : gen_mul_gf2p2
+    assign a_mul_ax_b = aes_mul_gf2p2(a_x, b);
+    assign b_mul_bx_b = aes_mul_gf2p2(b_x, b);
+  end
+
+  /////////////////
+  // Integration //
+  /////////////////
+  assign a_q = a_mul_x_z ^ a_mul_ax_b;
+  assign b_q = b_mul_x_z ^ b_mul_bx_b;
+
+  // Only GF(2^4) and GF(2^2) are supported.
+  `ASSERT_INIT(AesDomDepMulUnoptPower, NPower == 4 || NPower == 2)
+
+endmodule
+
+// DOM-dep GF(2^N) multiplier, unpipelined, first-order masked.
+// Computes (a_q ^ b_q) = (a_x ^ b_x) * (a_y ^ b_y), i.e. q = x * y using first-order
+// domain-oriented masking. The sharings of x and y are NOT required to be independent from each
+// other. This is the optimized version consuming 2 instead of 3 times N bits of randomness for
+// blinding and resharing.
+// See Formula 12 in [1].
+module aes_dom_dep_mul_gf2pn #(
+  parameter int unsigned NPower      = 4,
+  parameter bit          PreDOMIndep = 1'b0 // 1'b0: Not followed by a DOM-indep multiplier, this
+                                            //       enables additional area optimizations
+                                            // 1'b1: Directly followed by a DOM-indep multiplier,
+                                            //       this is the version discussed in [1].
+) (
+  input  logic              clk_i,
+  input  logic              rst_ni,
+  input  logic [NPower-1:0] a_x,    // Share a of x
+  input  logic [NPower-1:0] a_y,    // Share a of y
+  input  logic [NPower-1:0] b_x,    // Share b of x
+  input  logic [NPower-1:0] b_y,    // Share b of y
+  input  logic [NPower-1:0] z_0,    // Randomness for blinding
+  input  logic [NPower-1:0] z_1,    // Randomness for resharing
+  output logic [NPower-1:0] a_q,    // Share a of q
+  output logic [NPower-1:0] b_q     // Share b of q
+);
+
+  import aes_sbox_canright_pkg::*;
+
+  //////////////
+  // Blinding //
+  //////////////
+  // Blinding of y by z_0.
+  (* keep = "true" *) logic [NPower-1:0] a_yz0_d, a_yz0_q;
+  (* keep = "true" *) logic [NPower-1:0] b_yz0_d, b_yz0_q;
+  assign a_yz0_d = a_y ^ z_0;
+  assign b_yz0_d = b_y ^ z_0;
+
+  // Registers
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      a_yz0_q <= '0;
+      b_yz0_q <= '0;
+    end else begin
+      a_yz0_q <= a_yz0_d;
+      b_yz0_q <= b_yz0_d;
+    end
+  end
+
+  ////////////////
+  // Correction //
+  ////////////////
+  // Basically, this is a DOM-indep multiplier with:
+  // - a_x = a_x, b_x = b_x, and
+  // - a_y = z_0, b_y = 0 (constant),
+  // which allows for further optimizations.
+
+  // Calculation
+  (* keep = "true" *) logic [NPower-1:0] mul_ax_z0, mul_bx_z0;
+  if (NPower == 4) begin : gen_corr_mul_gf2p4
+    assign mul_ax_z0 = aes_mul_gf2p4(a_x, z_0);
+    assign mul_bx_z0 = aes_mul_gf2p4(b_x, z_0);
+
+  end else begin : gen_corr_mul_gf2p2
+    assign mul_ax_z0 = aes_mul_gf2p2(a_x, z_0);
+    assign mul_bx_z0 = aes_mul_gf2p2(b_x, z_0);
+  end
+
+  // Resharing
+  (* keep = "true" *) logic [NPower-1:0] axz0_z1_d, axz0_z1_q;
+  (* keep = "true" *) logic [NPower-1:0] bxz0_z1_d, bxz0_z1_q;
+  assign axz0_z1_d = mul_ax_z0 ^ z_1;
+  assign bxz0_z1_d = mul_bx_z0 ^ z_1;
+
+  // Registers
+  always_ff @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      axz0_z1_q <= '0;
+      bxz0_z1_q <= '0;
+    end else begin
+      axz0_z1_q <= axz0_z1_d;
+      bxz0_z1_q <= bxz0_z1_d;
+    end
+  end
+
+  ///////////////////////////////
+  // Calculation & Integration //
+  ///////////////////////////////
+  // Compute b. Note that unlike for the unoptimized implementation, we don't combine the blinded
+  // shares of y to obtain a single b value. Instead, every domain d gets its own version of b:
+  //
+  //   d_b = d_y ^ _D_y_z0
+  //
+  // where _D_y_z0 corresponds to the sum of all domains of y except for domain d, each
+  // individually blinded by z0 (needs to happen before the register bank). This optimization
+  // is only suitable for first-order masking.
+  // See Formula 12 in [1].
+
+  if (PreDOMIndep == 1'b1) begin : gen_pre_dom_indep
+    // This DOM-dep multiplier is directly followed by a DOM-indep multiplier without an additional
+    // pipeline stage in between. To prevent SCA leakage in the DOM-indep multiplier, the d_y and
+    // _D_y_z0 parts of d_b need to be individually multiplied with input x and then the results
+    // need to be integrated (summed up) on a per-domain basis.
+
+    // d_y part: Inner-domain terms of x * y
+    (* keep = "true" *) logic [NPower-1:0] mul_ax_ay_d, mul_ax_ay_q;
+    (* keep = "true" *) logic [NPower-1:0] mul_bx_by_d, mul_bx_by_q;
+    if (NPower == 4) begin : gen_inner_mul_gf2p4
+      assign mul_ax_ay_d = aes_mul_gf2p4(a_x, a_y);
+      assign mul_bx_by_d = aes_mul_gf2p4(b_x, b_y);
+
+    end else begin : gen_inner_mul_gf2p2
+      assign mul_ax_ay_d = aes_mul_gf2p2(a_x, a_y);
+      assign mul_bx_by_d = aes_mul_gf2p2(b_x, b_y);
+    end
+
+    // Registers
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        mul_ax_ay_q <= '0;
+        mul_bx_by_q <= '0;
+      end else begin
+        mul_ax_ay_q <= mul_ax_ay_d;
+        mul_bx_by_q <= mul_bx_by_d;
+      end
+    end
+
+    // Input Registers
+    (* keep = "true" *) logic [NPower-1:0] a_x_q;
+    (* keep = "true" *) logic [NPower-1:0] b_x_q;
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        a_x_q <= '0;
+        b_x_q <= '0;
+      end else begin
+        a_x_q <= a_x;
+        b_x_q <= b_x;
+      end
+    end
+
+    // _D_y_z0 part: Cross-domain terms: d_x * _D_y_z0
+    // Need to use registered version of input x.
+    (* keep = "true" *) logic [NPower-1:0] mul_ax_byz0, mul_bx_ayz0;
+    if (NPower == 4) begin : gen_cross_mul_gf2p4
+      assign mul_ax_byz0 = aes_mul_gf2p4(a_x_q, b_yz0_q);
+      assign mul_bx_ayz0 = aes_mul_gf2p4(b_x_q, a_yz0_q);
+
+    end else begin : gen_cross_mul_gf2p2
+      assign mul_ax_byz0 = aes_mul_gf2p2(a_x_q, b_yz0_q);
+      assign mul_bx_ayz0 = aes_mul_gf2p2(b_x_q, a_yz0_q);
+    end
+
+    // Integration
+    assign a_q = axz0_z1_q ^ mul_ax_ay_q ^ mul_ax_byz0;
+    assign b_q = bxz0_z1_q ^ mul_bx_by_q ^ mul_bx_ayz0;
+
+  end else begin : gen_not_pre_dom_indep
+    // This DOM-dep multiplier is not directly followed by a DOM-indep multiplier. As a result,
+    // the d_y and _D_y_z0 parts of d_b can be summed up prior to the multiplication with input
+    // x, which helps save 2 GF multipliers and 4 registers (NPower flops each).
+
+    // Sum up d_y and _D_y_z0.
+    (* keep = "true" *) logic [NPower-1:0] a_b, b_b;
+    assign a_b = a_y ^ b_yz0_q;
+    assign b_b = b_y ^ a_yz0_q;
+
+    // GF multiplications
+    (* keep = "true" *) logic [NPower-1:0] a_mul_ax_b, b_mul_bx_b;
+    if (NPower == 4) begin : gen_mul_gf2p4
+      assign a_mul_ax_b = aes_mul_gf2p4(a_x, a_b);
+      assign b_mul_bx_b = aes_mul_gf2p4(b_x, b_b);
+    end else begin : gen_mul_gf2p2
+      assign a_mul_ax_b = aes_mul_gf2p2(a_x, a_b);
+      assign b_mul_bx_b = aes_mul_gf2p2(b_x, b_b);
+    end
+
+    // Integration
+    assign a_q = axz0_z1_q ^ a_mul_ax_b;
+    assign b_q = bxz0_z1_q ^ b_mul_bx_b;
+  end
+
+  // Only GF(2^4) and GF(2^2) are supported.
+  `ASSERT_INIT(AesDomDepMulPower, NPower == 4 || NPower == 2)
+
+endmodule
+
+// Inverse in GF(2^4) using first-order domain-oriented masking and normal basis [z^4, z].
+// See Fig. 6 in [1] (grey block, Stages 2 and 3) and Formulas 6, 13, 14, 15, 16, 17 in [2].
+module aes_dom_inverse_gf2p4 (
+  input  logic        clk_i,
+  input  logic        rst_ni,
+  input  logic  [3:0] a_gamma,
+  input  logic  [3:0] b_gamma,
+  input  logic [11:0] prd,
+  output logic  [3:0] a_gamma_inv,
+  output logic  [3:0] b_gamma_inv
+);
+
+  import aes_sbox_canright_pkg::*;
+
+  // Distribute the randomness for the various multipliers.
+  logic [3:0] z_2;
+  logic [3:0] z_3_1;
+  logic [3:0] z_3_0;
+  assign z_2   = prd[3:0];
+  assign z_3_0 = prd[7:4];
+  assign z_3_1 = prd[11:8];
+
+  /////////////
+  // Stage 2 //
+  /////////////
+  // Formula 13 in [2].
+
+  logic [1:0] a_gamma1, a_gamma0, b_gamma1, b_gamma0, a_gamma1_gamma0, b_gamma1_gamma0;
+  (* keep = "true" *) logic [1:0] a_gamma_ss, b_gamma_ss;
+  assign a_gamma1 = a_gamma[3:2];
+  assign a_gamma0 = a_gamma[1:0];
+  assign b_gamma1 = b_gamma[3:2];
+  assign b_gamma0 = b_gamma[1:0];
+
+  assign a_gamma_ss = aes_scale_omega2_gf2p2(aes_square_gf2p2(a_gamma1 ^ a_gamma0));
+  assign b_gamma_ss = aes_scale_omega2_gf2p2(aes_square_gf2p2(b_gamma1 ^ b_gamma0));
+
+  aes_dom_dep_mul_gf2pn #(
+    .NPower      ( 2    ),
+    .PreDOMIndep ( 1'b0 )
+  ) aes_dom_mul_gamma1_gamma0 (
+    .clk_i  ( clk_i           ),
+    .rst_ni ( rst_ni          ),
+    .a_x    ( a_gamma1        ), // Share a of x
+    .a_y    ( a_gamma0        ), // Share a of y
+    .b_x    ( b_gamma1        ), // Share b of x
+    .b_y    ( b_gamma0        ), // Share b of y
+    .z_0    ( z_2[1:0]        ), // Randomness for blinding
+    .z_1    ( z_2[3:2]        ), // Randomness for resharing
+    .a_q    ( a_gamma1_gamma0 ), // Share a of q
+    .b_q    ( b_gamma1_gamma0 )  // Share b of q
+  );
+
+  /////////////
+  // Stage 3 //
+  /////////////
+
+  // Formulas 14 and 15 in [2].
+  (* keep = "true" *) logic [1:0] a_omega, b_omega;
+  assign a_omega = aes_square_gf2p2(a_gamma1_gamma0 ^ a_gamma_ss);
+  assign b_omega = aes_square_gf2p2(b_gamma1_gamma0 ^ b_gamma_ss);
+
+  // Formulas 16 and 17 in [2].
+
+  aes_dom_dep_mul_gf2pn #(
+    .NPower      ( 2    ),
+    .PreDOMIndep ( 1'b1 )
+  ) aes_dom_mul_omega_gamma1 (
+    .clk_i  ( clk_i            ),
+    .rst_ni ( rst_ni           ),
+    .a_x    ( a_gamma1         ), // Share a of x
+    .a_y    ( a_omega          ), // Share a of y
+    .b_x    ( b_gamma1         ), // Share b of x
+    .b_y    ( b_omega          ), // Share b of y
+    .z_0    ( z_3_1[1:0]       ), // Randomness for blinding
+    .z_1    ( z_3_1[3:2]       ), // Randomness for resharing
+    .a_q    ( a_gamma_inv[1:0] ), // Share a of q
+    .b_q    ( b_gamma_inv[1:0] )  // Share b of q
+  );
+
+  aes_dom_dep_mul_gf2pn #(
+    .NPower      ( 2    ),
+    .PreDOMIndep ( 1'b1 )
+  ) aes_dom_mul_omega_gamma0 (
+    .clk_i  ( clk_i            ),
+    .rst_ni ( rst_ni           ),
+    .a_x    ( a_omega          ), // Share a of x
+    .a_y    ( a_gamma0         ), // Share a of y
+    .b_x    ( b_omega          ), // Share b of x
+    .b_y    ( b_gamma0         ), // Share b of y
+    .z_0    ( z_3_0[1:0]       ), // Randomness for blinding
+    .z_1    ( z_3_0[3:2]       ), // Randomness for resharing
+    .a_q    ( a_gamma_inv[3:2] ), // Share a of q
+    .b_q    ( b_gamma_inv[3:2] )  // Share b of q
+  );
+
+endmodule
+
+// Inverse in GF(2^8) using first-order domain-oriented masking and normal basis [y^16, y].
+// See Fig. 6 in [1] and Formulas 3, 12, 18 and 19 in [2].
+module aes_dom_inverse_gf2p8 (
+  input  logic        clk_i,
+  input  logic        rst_ni,
+  input  logic  [7:0] a_y,     // input data masked by b_y
+  input  logic  [7:0] b_y,     // input mask
+  input  logic [27:0] prd,     // pseudo-random data, e.g. for intermediate masks
+  output logic  [7:0] a_y_inv, // output data masked by b_y_inv
+  output logic  [7:0] b_y_inv  // output mask
+);
+
+  import aes_sbox_canright_pkg::*;
+
+  // Distribute the randomness for the various stages.
+  logic  [7:0] z_1;
+  logic [11:0] z_23;
+  logic  [3:0] z_4_0;
+  logic  [3:0] z_4_1;
+  assign z_1   = prd[7:0];
+  assign z_23  = prd[19:8];
+  assign z_4_0 = prd[23:20];
+  assign z_4_1 = prd[27:24];
+
+  /////////////
+  // Stage 1 //
+  /////////////
+  // Formula 12 in [2].
+
+  logic [3:0] a_y1, a_y0, b_y1, b_y0, a_y1_y0, b_y1_y0;
+  (* keep = "true" *) logic [3:0] a_y_ss, b_y_ss, a_gamma, b_gamma;
+  assign a_y1 = a_y[7:4];
+  assign a_y0 = a_y[3:0];
+  assign b_y1 = b_y[7:4];
+  assign b_y0 = b_y[3:0];
+
+  assign a_y_ss = aes_square_scale_gf2p4_gf2p2(a_y1 ^ a_y0);
+  assign b_y_ss = aes_square_scale_gf2p4_gf2p2(b_y1 ^ b_y0);
+
+  aes_dom_dep_mul_gf2pn #(
+    .NPower      ( 4    ),
+    .PreDOMIndep ( 1'b0 )
+  ) aes_dom_mul_y1_y0 (
+    .clk_i  ( clk_i    ),
+    .rst_ni ( rst_ni   ),
+    .a_x    ( a_y1     ), // Share a of x
+    .a_y    ( a_y0     ), // Share a of y
+    .b_x    ( b_y1     ), // Share b of x
+    .b_y    ( b_y0     ), // Share b of y
+    .z_0    ( z_1[3:0] ), // Randomness for blinding
+    .z_1    ( z_1[7:4] ), // Randomness for resharing
+    .a_q    ( a_y1_y0  ), // Share a of q
+    .b_q    ( b_y1_y0  )  // Share b of q
+  );
+
+  assign a_gamma = a_y_ss ^ a_y1_y0;
+  assign b_gamma = b_y_ss ^ b_y1_y0;
+
+  ////////////////////
+  // Stages 2 and 3 //
+  ////////////////////
+
+  logic [3:0] a_theta, b_theta;
+
+  // a_gamma is masked by b_gamma, a_gamma_inv is masked by b_gamma_inv.
+  aes_dom_inverse_gf2p4 aes_dom_inverse_gf2p4 (
+    .clk_i       ( clk_i   ),
+    .rst_ni      ( rst_ni  ),
+    .a_gamma     ( a_gamma ),
+    .b_gamma     ( b_gamma ),
+    .prd         ( z_23    ),
+    .a_gamma_inv ( a_theta ),
+    .b_gamma_inv ( b_theta )
+  );
+
+  /////////////
+  // Stage 4 //
+  /////////////
+  // Formulas 18 and 19 in [2].
+
+  aes_dom_indep_mul_gf2pn #(
+    .NPower ( 4 )
+  ) aes_dom_mul_theta_y1 (
+    .clk_i  ( clk_i        ),
+    .rst_ni ( rst_ni       ),
+    .a_x    ( a_y1         ), // Share a of x
+    .a_y    ( a_theta      ), // Share a of y
+    .b_x    ( b_y1         ), // Share b of x
+    .b_y    ( b_theta      ), // Share b of y
+    .z_0    ( z_4_1        ), // Randomness for resharing
+    .a_q    ( a_y_inv[3:0] ), // Share a of q
+    .b_q    ( b_y_inv[3:0] )  // Share b of q
+  );
+
+  aes_dom_indep_mul_gf2pn #(
+    .NPower ( 4 )
+  ) aes_dom_mul_theta_y0 (
+    .clk_i  ( clk_i        ),
+    .rst_ni ( rst_ni       ),
+    .a_x    ( a_theta      ), // Share a of x
+    .a_y    ( a_y0         ), // Share a of y
+    .b_x    ( b_theta      ), // Share b of x
+    .b_y    ( b_y0         ), // Share b of y
+    .z_0    ( z_4_0        ), // Randomness for resharing
+    .a_q    ( a_y_inv[7:4] ), // Share a of q
+    .b_q    ( b_y_inv[7:4] )  // Share b of q
+  );
+
+endmodule
+
+module aes_sbox_dom (
+  input  logic              clk_i,
+  input  logic              rst_ni,
+  input  logic              en_i,
+  output logic              out_req_o,
+  input  logic              out_ack_i,
+  input  aes_pkg::ciph_op_e op_i,
+  input  logic        [7:0] data_i, // masked, the actual input data is data_i ^ mask_i
+  input  logic        [7:0] mask_i, // input mask
+  input  logic       [27:0] prd_i,  // pseudo-random data for remasking
+  output logic        [7:0] data_o, // masked, the actual output data is data_o ^ mask_o
+  output logic        [7:0] mask_o  // output mask
+);
+
+  import aes_pkg::*;
+  import aes_sbox_canright_pkg::*;
+
+  logic [7:0] in_data_basis_x, out_data_basis_x;
+  logic [7:0] in_mask_basis_x, out_mask_basis_x;
+
+  // Convert data to normal basis X.
+  assign in_data_basis_x = (op_i == CIPH_FWD) ? aes_mvm(data_i, A2X) :
+                                                aes_mvm(data_i ^ 8'h63, S2X);
+
+  // Convert mask to normal basis X.
+  // The addition of constant 8'h63 prior to the affine transformation is skipped.
+  assign in_mask_basis_x  = (op_i == CIPH_FWD) ? aes_mvm(mask_i, A2X) :
+                                                 aes_mvm(mask_i, S2X);
+
+  // Do the inversion in normal basis X.
+  aes_dom_inverse_gf2p8 aes_dom_inverse_gf2p8 (
+    .clk_i   ( clk_i            ),
+    .rst_ni  ( rst_ni           ),
+    .a_y     ( in_data_basis_x  ), // input
+    .b_y     ( in_mask_basis_x  ), // input
+    .prd     ( prd_i            ), // input
+    .a_y_inv ( out_data_basis_x ), // output
+    .b_y_inv ( out_mask_basis_x )  // output
+  );
+
+  // Convert data to basis S or A.
+  assign data_o = (op_i == CIPH_FWD) ? (aes_mvm(out_data_basis_x, X2S) ^ 8'h63) :
+                                       (aes_mvm(out_data_basis_x, X2A));
+
+  // Convert mask to basis S or A.
+  // The addition of constant 8'h63 following the affine transformation is skipped.
+  assign mask_o = (op_i == CIPH_FWD) ? aes_mvm(out_mask_basis_x, X2S) :
+                                       aes_mvm(out_mask_basis_x, X2A);
+
+  // Counter register
+  logic [2:0] count_d, count_q;
+  assign count_d = (out_req_o && out_ack_i) ? '0               :
+                   out_req_o                ? count_q          :
+                   en_i                     ? count_q + 3'b001 : count_q;
+  always_ff @(posedge clk_i or negedge rst_ni) begin : reg_count
+    if (!rst_ni) begin
+      count_q <= '0;
+    end else begin
+      count_q <= count_d;
+    end
+  end
+  assign out_req_o = en_i & count_q == 3'b100;
+
+endmodule