[aes] Add DOM S-Box
This commit adds an S-Box implementation that uses domain-oriented masking.
Signed-off-by: Pirmin Vogel <vogelpi@lowrisc.org>
diff --git a/hw/ip/aes/aes.core b/hw/ip/aes/aes.core
index e12c8ef..437aee3 100644
--- a/hw/ip/aes/aes.core
+++ b/hw/ip/aes/aes.core
@@ -28,6 +28,7 @@
- rtl/aes_sbox_canright.sv
- rtl/aes_sbox_canright_masked_noreuse.sv
- rtl/aes_sbox_canright_masked.sv
+ - rtl/aes_sbox_dom.sv
- rtl/aes_shift_rows.sv
- rtl/aes_mix_columns.sv
- rtl/aes_mix_single_column.sv
diff --git a/hw/ip/aes/lint/aes.vlt b/hw/ip/aes/lint/aes.vlt
index 004c2ba..660f05e 100644
--- a/hw/ip/aes/lint/aes.vlt
+++ b/hw/ip/aes/lint/aes.vlt
@@ -12,3 +12,4 @@
// Masked SBox implementations may require multiple modules to prevent aggressive synthesis optimizations.
lint_off -rule DECLFILENAME -file "*/rtl/aes_sbox_*_masked*.sv" -match "Filename 'aes_sbox_*_masked*' does not match MODULE name: *"
+lint_off -rule DECLFILENAME -file "*/rtl/aes_sbox_dom*.sv" -match "Filename 'aes_sbox_dom*' does not match MODULE name: *"
diff --git a/hw/ip/aes/pre_dv/aes_sbox_lec/aes_sbox_lec.py b/hw/ip/aes/pre_dv/aes_sbox_lec/aes_sbox_lec.py
index be99494..016ab50 100755
--- a/hw/ip/aes/pre_dv/aes_sbox_lec/aes_sbox_lec.py
+++ b/hw/ip/aes/pre_dv/aes_sbox_lec/aes_sbox_lec.py
@@ -33,6 +33,9 @@
impl_list = [
impl_dut.replace(rtl_path, '').replace('.sv', '') for impl_dut in impl_list
]
+# Remove multicycle implementations, we can't perform LEC for those.
+impl_list.remove('aes_sbox_dom')
+# Remove reference implementation and package files.
impl_list.remove(impl_gold)
impl_list.remove(file_pkg_canright)
file_pkg_canright = file_pkg_canright + '.sv'
diff --git a/hw/ip/aes/pre_dv/aes_sbox_tb/rtl/aes_sbox_tb.sv b/hw/ip/aes/pre_dv/aes_sbox_tb/rtl/aes_sbox_tb.sv
index b996cf6..5ab986a 100644
--- a/hw/ip/aes/pre_dv/aes_sbox_tb/rtl/aes_sbox_tb.sv
+++ b/hw/ip/aes/pre_dv/aes_sbox_tb/rtl/aes_sbox_tb.sv
@@ -20,7 +20,7 @@
ciph_op_e op;
localparam int NUM_SBOX_IMPLS = 2;
- localparam int NUM_SBOX_IMPLS_MASKED = 2;
+ localparam int NUM_SBOX_IMPLS_MASKED = 3;
localparam int NumSBoxImplsTotal = NUM_SBOX_IMPLS + NUM_SBOX_IMPLS_MASKED;
logic [7:0] responses[NumSBoxImplsTotal];
@@ -29,7 +29,7 @@
always_ff @(posedge clk_i or negedge rst_ni) begin : reg_count
if (!rst_ni) begin
count_q <= '0;
- end else begin
+ end else if (dom_done) begin
count_q <= count_d;
end
end
@@ -53,27 +53,28 @@
// Mask Generation
parameter int unsigned WidthPRDSBoxCanrightMasked = 8;
parameter int unsigned WidthPRDSBoxCanrightMaskedNoreuse = 18;
+ parameter int unsigned WidthPRDSBoxDOM = 28;
- logic [7:0] masked_stimulus;
- logic [7:0] in_mask;
+ logic [7:0] masked_stimulus;
+ logic [7:0] in_mask;
- logic [7:0] masked_response [NUM_SBOX_IMPLS_MASKED];
- logic [7:0] out_mask [NUM_SBOX_IMPLS_MASKED];
+ logic [7:0] masked_response [NUM_SBOX_IMPLS_MASKED];
+ logic [7:0] out_mask [NUM_SBOX_IMPLS_MASKED];
- logic [31:0] tmp;
- logic [31-(WidthPRDSBoxCanrightMaskedNoreuse+8):0] unused_tmp;
- logic [WidthPRDSBoxCanrightMaskedNoreuse-1:0] prd_masking;
+ logic [63:0] tmp;
+ logic [63-(WidthPRDSBoxDOM+8):0] unused_tmp;
+ logic [WidthPRDSBoxDOM-1:0] prd_masking;
always_ff @(posedge clk_i or negedge rst_ni) begin : reg_tmp
if (!rst_ni) begin
- tmp <= 32'hAAAFF;
- end else begin
- tmp <= $random;
+ tmp <= 64'hAAAFF;
+ end else if (dom_done) begin
+ tmp <= {$random, $random};
end
end
assign in_mask = tmp[7:0];
- assign prd_masking = tmp[8 +: WidthPRDSBoxCanrightMaskedNoreuse];
- assign unused_tmp = tmp[31:WidthPRDSBoxCanrightMaskedNoreuse+8];
+ assign prd_masking = tmp[8 +: WidthPRDSBoxDOM];
+ assign unused_tmp = tmp[63:WidthPRDSBoxDOM+8];
assign masked_stimulus = stimulus ^ in_mask;
@@ -96,6 +97,22 @@
.mask_o ( out_mask[1] )
);
+ // Instantiate DOM SBox Implementation
+ logic dom_done;
+ aes_sbox_dom aes_sbox_dom (
+ .clk_i ( clk_i ),
+ .rst_ni ( rst_ni ),
+ .en_i ( 1'b1 ),
+ .out_req_o ( dom_done ),
+ .out_ack_i ( 1'b1 ),
+ .op_i ( op ),
+ .data_i ( masked_stimulus ),
+ .mask_i ( in_mask ),
+ .prd_i ( prd_masking[WidthPRDSBoxDOM-1:0] ),
+ .data_o ( masked_response[2] ),
+ .mask_o ( out_mask[2] )
+ );
+
// Unmask responses
always_comb begin : unmask_resp
for (int i=0; i<NUM_SBOX_IMPLS_MASKED; i++) begin
@@ -109,7 +126,7 @@
test_passed_o <= 1'b1;
for (int i=1; i<NumSBoxImplsTotal; i++) begin
- if (rst_ni && (responses[i] != responses[0])) begin
+ if (rst_ni && dom_done && (responses[i] != responses[0])) begin
$display("\nERROR: Mismatch between LUT-based S-Box and Implementation %0d found.", i);
$display("op = %s, stimulus = 8'h%h, expected resp = 8'h%h, actual resp = 8'h%h\n",
(op == CIPH_FWD) ? "CIPH_FWD" : "CIPH_INV", stimulus, responses[0], responses[i]);
diff --git a/hw/ip/aes/rtl/aes_cipher_core.sv b/hw/ip/aes/rtl/aes_cipher_core.sv
index 00c4050..3711140 100644
--- a/hw/ip/aes/rtl/aes_cipher_core.sv
+++ b/hw/ip/aes/rtl/aes_cipher_core.sv
@@ -636,7 +636,8 @@
`ASSERT_INIT(AesMaskedCoreAndSBox,
(Masking &&
(SBoxImpl == SBoxImplCanrightMasked ||
- SBoxImpl == SBoxImplCanrightMaskedNoreuse)) ||
+ SBoxImpl == SBoxImplCanrightMaskedNoreuse ||
+ SBoxImpl == SBoxImplDom)) ||
(!Masking &&
(SBoxImpl == SBoxImplLut ||
SBoxImpl == SBoxImplCanright)))
diff --git a/hw/ip/aes/rtl/aes_key_expand.sv b/hw/ip/aes/rtl/aes_key_expand.sv
index b326e03..7935aa0 100644
--- a/hw/ip/aes/rtl/aes_key_expand.sv
+++ b/hw/ip/aes/rtl/aes_key_expand.sv
@@ -388,7 +388,8 @@
`ASSERT_INIT(AesMaskedCoreAndSBox,
(Masking &&
(SBoxImpl == SBoxImplCanrightMasked ||
- SBoxImpl == SBoxImplCanrightMaskedNoreuse)) ||
+ SBoxImpl == SBoxImplCanrightMaskedNoreuse ||
+ SBoxImpl == SBoxImplDom)) ||
(!Masking &&
(SBoxImpl == SBoxImplLut ||
SBoxImpl == SBoxImplCanright)))
diff --git a/hw/ip/aes/rtl/aes_pkg.sv b/hw/ip/aes/rtl/aes_pkg.sv
index d1c743a..b4e5a79 100644
--- a/hw/ip/aes/rtl/aes_pkg.sv
+++ b/hw/ip/aes/rtl/aes_pkg.sv
@@ -49,12 +49,14 @@
216'h6587da04c59c02125750f35e7634e08951122874022ce19b143211;
typedef enum integer {
- SBoxImplLut, // Unmasked LUT-based S-Box
- SBoxImplCanright, // Unmasked Canright S-Box, see aes_sbox_canright.sv
- SBoxImplCanrightMasked, // First-order masked Canright S-Box
- // see aes_sbox_canright_masked.sv
- SBoxImplCanrightMaskedNoreuse // First-order masked Canright S-Box without mask reuse,
- // see aes_sbox_canright_masked_noreuse.sv
+ SBoxImplLut, // Unmasked LUT-based S-Box
+ SBoxImplCanright, // Unmasked Canright S-Box, see aes_sbox_canright.sv
+ SBoxImplCanrightMasked, // First-order masked Canright S-Box
+ // see aes_sbox_canright_masked.sv
+ SBoxImplCanrightMaskedNoreuse, // First-order masked Canright S-Box without mask reuse,
+ // see aes_sbox_canright_masked_noreuse.sv
+ SBoxImplDom // First-order masked S-Box using domain-oriented masking,
+ // see aes_sbox_dom.sv
} sbox_impl_e;
typedef enum logic {
diff --git a/hw/ip/aes/rtl/aes_sbox.sv b/hw/ip/aes/rtl/aes_sbox.sv
index b3dc17b..ac0e2cd 100644
--- a/hw/ip/aes/rtl/aes_sbox.sv
+++ b/hw/ip/aes/rtl/aes_sbox.sv
@@ -23,9 +23,10 @@
import aes_pkg::*;
localparam bit SBoxMasked = (SBoxImpl == SBoxImplCanrightMasked ||
- SBoxImpl == SBoxImplCanrightMaskedNoreuse) ? 1'b1 : 1'b0;
+ SBoxImpl == SBoxImplCanrightMaskedNoreuse ||
+ SBoxImpl == SBoxImplDom) ? 1'b1 : 1'b0;
- localparam bit SBoxSingleCycle = 1'b1;
+ localparam bit SBoxSingleCycle = (SBoxImpl == SBoxImplDom) ? 1'b0 : 1'b1;
if (!SBoxMasked) begin : gen_sbox_unmasked
// Tie off unused inputs.
@@ -57,7 +58,22 @@
end else begin : gen_sbox_masked
- if (SBoxImpl == SBoxImplCanrightMaskedNoreuse) begin : gen_sbox_canright_masked_noreuse
+ if (SBoxImpl == SBoxImplDom) begin : gen_sbox_dom
+ aes_sbox_dom u_aes_sbox (
+ .clk_i ( clk_i ),
+ .rst_ni ( rst_ni ),
+ .en_i ( en_i ),
+ .out_req_o ( out_req_o ),
+ .out_ack_i ( out_ack_i ),
+ .op_i ( op_i ),
+ .data_i ( data_i ),
+ .mask_i ( mask_i ),
+ .prd_i ( prd_i[27:0] ),
+ .data_o ( data_o ),
+ .mask_o ( mask_o )
+ );
+
+ end else if (SBoxImpl == SBoxImplCanrightMaskedNoreuse) begin : gen_sbox_canright_masked_noreuse
// Tie off unused inputs.
logic unused_clk;
logic unused_rst;
@@ -107,25 +123,6 @@
// Signal that we have valid output right away.
assign out_req_o = en_i;
- end else begin : gen_req_multicycle
-
- // All currently implemented S-Boxes allow for single-cycle operation. Future S-Box
- // implementations may require multiple clock cycles. The counter below is for mimicking such
- // implementations. It's for testing purposes only.
-
- // Counter register
- logic [2:0] count_d, count_q;
- assign count_d = (out_req_o && out_ack_i) ? '0 :
- out_req_o ? count_q :
- en_i ? count_q + 3'b001 : count_q;
- always_ff @(posedge clk_i or negedge rst_ni) begin : reg_count
- if (!rst_ni) begin
- count_q <= '0;
- end else begin
- count_q <= count_d;
- end
- end
- assign out_req_o = en_i & count_q == 3'b111;
end
endmodule
diff --git a/hw/ip/aes/rtl/aes_sbox_dom.sv b/hw/ip/aes/rtl/aes_sbox_dom.sv
new file mode 100644
index 0000000..2f4fdec
--- /dev/null
+++ b/hw/ip/aes/rtl/aes_sbox_dom.sv
@@ -0,0 +1,657 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// AES S-Box with First-Order Domain-Oriented Masking
+//
+// This is the unpipelined version using DOM-dep multipliers. It has a latency of 5 clock cycles
+// and requires 28 bits of pseudo-random data per evaluation. Pipelining would only be beneficial
+// when using
+// - either a cipher core architecture with a data path smaller than 128 bit, i.e., where the
+// individual S-Boxes are evaluated more than once per round, or
+// - a fully unrolled cipher core architecture for maximum throughput.
+//
+// Note: The DOM AES S-Box is built on top of the Canright masked S-Box without mask re-use.
+//
+// For details, see the following papers and reports:
+// [1] Gross, "Domain-Oriented Masking: Compact Masked Hardware Implementations with Arbitrary
+// Protection Order" available at https://eprint.iacr.org/2016/486.pdf
+// [2] Canright, "A very compact 'perfectly masked' S-box for AES (corrected)" available at
+// https://eprint.iacr.org/2009/011.pdf
+// [3] Canright, "A very compact Rijndael S-box" available at https://hdl.handle.net/10945/25608
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// IMPORTANT NOTE: //
+// DO NOT USE THIS FOR SYNTHESIS BLINDLY! //
+// //
+// This implementation targets primarily Xilinx Vivado synthesis as well as RTL simulation. It //
+// contains synthesis attributes specific to Xilinx Vivado to prevent the synthesis tool from //
+// optimizing away registers and to enforce the correct ordering of operations. Other synthesis //
+// tools might still heavily optimize the design. The result is likely insecure. Use with care. //
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+`include "prim_assert.sv"
+
+// DOM-indep GF(2^N) multiplier, unpipelined, first-order masked.
+// Computes (a_q ^ b_q) = (a_x ^ b_x) * (a_y ^ b_y), i.e. q = x * y using first-order
+// domain-oriented masking. The sharings of x and y are required to be uniformly random and
+// independent from each other.
+// See Fig. 2 in [1].
+module aes_dom_indep_mul_gf2pn #(
+ parameter int unsigned NPower = 4
+) (
+ input logic clk_i,
+ input logic rst_ni,
+ input logic [NPower-1:0] a_x, // Share a of x
+ input logic [NPower-1:0] a_y, // Share a of y
+ input logic [NPower-1:0] b_x, // Share b of x
+ input logic [NPower-1:0] b_y, // Share b of y
+ input logic [NPower-1:0] z_0, // Randomness for resharing
+ output logic [NPower-1:0] a_q, // Share a of q
+ output logic [NPower-1:0] b_q // Share b of q
+);
+
+ import aes_sbox_canright_pkg::*;
+
+ /////////////////
+ // Calculation //
+ /////////////////
+ // Inner-domain terms
+ (* keep = "true" *) logic [NPower-1:0] mul_ax_ay, mul_bx_by;
+ if (NPower == 4) begin : gen_inner_mul_gf2p4
+ assign mul_ax_ay = aes_mul_gf2p4(a_x, a_y);
+ assign mul_bx_by = aes_mul_gf2p4(b_x, b_y);
+
+ end else begin : gen_inner_mul_gf2p2
+ assign mul_ax_ay = aes_mul_gf2p2(a_x, a_y);
+ assign mul_bx_by = aes_mul_gf2p2(b_x, b_y);
+ end
+
+ // Cross-domain terms
+ logic [NPower-1:0] mul_ax_by, mul_ay_bx;
+ if (NPower == 4) begin : gen_cross_mul_gf2p4
+ assign mul_ax_by = aes_mul_gf2p4(a_x, b_y);
+ assign mul_ay_bx = aes_mul_gf2p4(a_y, b_x);
+
+ end else begin : gen_cross_mul_gf2p2
+ assign mul_ax_by = aes_mul_gf2p2(a_x, b_y);
+ assign mul_ay_bx = aes_mul_gf2p2(a_y, b_x);
+ end
+
+ ///////////////
+ // Resharing //
+ ///////////////
+ // Resharing of cross-domain terms
+ (* keep = "true" *) logic [NPower-1:0] aq_z0_d, aq_z0_q;
+ (* keep = "true" *) logic [NPower-1:0] bq_z0_d, bq_z0_q;
+ assign aq_z0_d = z_0 ^ mul_ax_by;
+ assign bq_z0_d = z_0 ^ mul_ay_bx;
+
+ // Registers
+ always_ff @(posedge clk_i or negedge rst_ni) begin
+ if (!rst_ni) begin
+ aq_z0_q <= '0;
+ bq_z0_q <= '0;
+ end else begin
+ aq_z0_q <= aq_z0_d;
+ bq_z0_q <= bq_z0_d;
+ end
+ end
+
+ /////////////////
+ // Integration //
+ /////////////////
+ assign a_q = mul_ax_ay ^ aq_z0_q;
+ assign b_q = mul_bx_by ^ bq_z0_q;
+
+ // Only GF(2^4) and GF(2^2) are supported.
+ `ASSERT_INIT(AesDomIndepMulPower, NPower == 4 || NPower == 2)
+
+endmodule
+
+// DOM-dep GF(2^N) multiplier, unpipelined, first-order masked.
+// Computes (a_q ^ b_q) = (a_x ^ b_x) * (a_y ^ b_y), i.e. q = x * y using first-order
+// domain-oriented masking. The sharings of x and y are NOT required to be independent from each
+// other. This is the un-optimized version consuming 3 times N bits of randomness for blinding and
+// resharing. It is not used in the design but we keep it for reference.
+// See Fig. 4 and Formulas 8 - 11 in [1].
+module aes_dom_dep_mul_gf2pn_unopt #(
+ parameter int unsigned NPower = 4
+) (
+ input logic clk_i,
+ input logic rst_ni,
+ input logic [NPower-1:0] a_x, // Share a of x
+ input logic [NPower-1:0] a_y, // Share a of y
+ input logic [NPower-1:0] b_x, // Share b of x
+ input logic [NPower-1:0] b_y, // Share b of y
+ input logic [NPower-1:0] a_z, // Randomness for blinding
+ input logic [NPower-1:0] b_z, // Randomness for blinding
+ input logic [NPower-1:0] z_0, // Randomness for resharing
+ output logic [NPower-1:0] a_q, // Share a of q
+ output logic [NPower-1:0] b_q // Share b of q
+);
+
+ import aes_sbox_canright_pkg::*;
+
+ //////////////
+ // Blinding //
+ //////////////
+ // Blinding of y by z.
+ (* keep = "true" *) logic [NPower-1:0] a_yz_d, a_yz_q;
+ (* keep = "true" *) logic [NPower-1:0] b_yz_d, b_yz_q;
+ assign a_yz_d = a_y ^ a_z;
+ assign b_yz_d = b_y ^ b_z;
+
+ // Registers
+ always_ff @(posedge clk_i or negedge rst_ni) begin
+ if (!rst_ni) begin
+ a_yz_q <= '0;
+ b_yz_q <= '0;
+ end else begin
+ a_yz_q <= a_yz_d;
+ b_yz_q <= b_yz_d;
+ end
+ end
+
+ ////////////////
+ // Correction //
+ ////////////////
+ logic [NPower-1:0] a_mul_x_z, b_mul_x_z;
+ aes_dom_indep_mul_gf2pn #(
+ .NPower ( NPower )
+ ) aes_dom_indep_mul_gf2pn (
+ .clk_i ( clk_i ),
+ .rst_ni ( rst_ni ),
+ .a_x ( a_x ), // Share a of x
+ .a_y ( a_z ), // Share a of z
+ .b_x ( b_x ), // Share b of x
+ .b_y ( b_z ), // Share b of z
+ .z_0 ( z_0 ), // Randomness for resharing
+ .a_q ( a_mul_x_z ), // Share a of x * z
+ .b_q ( b_mul_x_z ) // Share b of x * z
+ );
+
+ /////////////////
+ // Calculation //
+ /////////////////
+ // Combine shares of blinded y to obtain b.
+ logic [NPower-1:0] b;
+ assign b = a_yz_q ^ b_yz_q;
+
+ logic [NPower-1:0] a_mul_ax_b, b_mul_bx_b;
+ if (NPower == 4) begin : gen_mul_gf2p4
+ assign a_mul_ax_b = aes_mul_gf2p4(a_x, b);
+ assign b_mul_bx_b = aes_mul_gf2p4(b_x, b);
+
+ end else begin : gen_mul_gf2p2
+ assign a_mul_ax_b = aes_mul_gf2p2(a_x, b);
+ assign b_mul_bx_b = aes_mul_gf2p2(b_x, b);
+ end
+
+ /////////////////
+ // Integration //
+ /////////////////
+ assign a_q = a_mul_x_z ^ a_mul_ax_b;
+ assign b_q = b_mul_x_z ^ b_mul_bx_b;
+
+ // Only GF(2^4) and GF(2^2) are supported.
+ `ASSERT_INIT(AesDomDepMulUnoptPower, NPower == 4 || NPower == 2)
+
+endmodule
+
+// DOM-dep GF(2^N) multiplier, unpipelined, first-order masked.
+// Computes (a_q ^ b_q) = (a_x ^ b_x) * (a_y ^ b_y), i.e. q = x * y using first-order
+// domain-oriented masking. The sharings of x and y are NOT required to be independent from each
+// other. This is the optimized version consuming 2 instead of 3 times N bits of randomness for
+// blinding and resharing.
+// See Formula 12 in [1].
+module aes_dom_dep_mul_gf2pn #(
+ parameter int unsigned NPower = 4,
+ parameter bit PreDOMIndep = 1'b0 // 1'b0: Not followed by a DOM-indep multiplier, this
+ // enables additional area optimizations
+ // 1'b1: Directly followed by a DOM-indep multiplier,
+ // this is the version discussed in [1].
+) (
+ input logic clk_i,
+ input logic rst_ni,
+ input logic [NPower-1:0] a_x, // Share a of x
+ input logic [NPower-1:0] a_y, // Share a of y
+ input logic [NPower-1:0] b_x, // Share b of x
+ input logic [NPower-1:0] b_y, // Share b of y
+ input logic [NPower-1:0] z_0, // Randomness for blinding
+ input logic [NPower-1:0] z_1, // Randomness for resharing
+ output logic [NPower-1:0] a_q, // Share a of q
+ output logic [NPower-1:0] b_q // Share b of q
+);
+
+ import aes_sbox_canright_pkg::*;
+
+ //////////////
+ // Blinding //
+ //////////////
+ // Blinding of y by z_0.
+ (* keep = "true" *) logic [NPower-1:0] a_yz0_d, a_yz0_q;
+ (* keep = "true" *) logic [NPower-1:0] b_yz0_d, b_yz0_q;
+ assign a_yz0_d = a_y ^ z_0;
+ assign b_yz0_d = b_y ^ z_0;
+
+ // Registers
+ always_ff @(posedge clk_i or negedge rst_ni) begin
+ if (!rst_ni) begin
+ a_yz0_q <= '0;
+ b_yz0_q <= '0;
+ end else begin
+ a_yz0_q <= a_yz0_d;
+ b_yz0_q <= b_yz0_d;
+ end
+ end
+
+ ////////////////
+ // Correction //
+ ////////////////
+ // Basically, this is a DOM-indep multiplier with:
+ // - a_x = a_x, b_x = b_x, and
+ // - a_y = z_0, b_y = 0 (constant),
+ // which allows for further optimizations.
+
+ // Calculation
+ (* keep = "true" *) logic [NPower-1:0] mul_ax_z0, mul_bx_z0;
+ if (NPower == 4) begin : gen_corr_mul_gf2p4
+ assign mul_ax_z0 = aes_mul_gf2p4(a_x, z_0);
+ assign mul_bx_z0 = aes_mul_gf2p4(b_x, z_0);
+
+ end else begin : gen_corr_mul_gf2p2
+ assign mul_ax_z0 = aes_mul_gf2p2(a_x, z_0);
+ assign mul_bx_z0 = aes_mul_gf2p2(b_x, z_0);
+ end
+
+ // Resharing
+ (* keep = "true" *) logic [NPower-1:0] axz0_z1_d, axz0_z1_q;
+ (* keep = "true" *) logic [NPower-1:0] bxz0_z1_d, bxz0_z1_q;
+ assign axz0_z1_d = mul_ax_z0 ^ z_1;
+ assign bxz0_z1_d = mul_bx_z0 ^ z_1;
+
+ // Registers
+ always_ff @(posedge clk_i or negedge rst_ni) begin
+ if (!rst_ni) begin
+ axz0_z1_q <= '0;
+ bxz0_z1_q <= '0;
+ end else begin
+ axz0_z1_q <= axz0_z1_d;
+ bxz0_z1_q <= bxz0_z1_d;
+ end
+ end
+
+ ///////////////////////////////
+ // Calculation & Integration //
+ ///////////////////////////////
+ // Compute b. Note that unlike for the unoptimized implementation, we don't combine the blinded
+ // shares of y to obtain a single b value. Instead, every domain d gets its own version of b:
+ //
+ // d_b = d_y ^ _D_y_z0
+ //
+ // where _D_y_z0 corresponds to the sum of all domains of y except for domain d, each
+ // individually blinded by z0 (needs to happen before the register bank). This optimization
+ // is only suitable for first-order masking.
+ // See Formula 12 in [1].
+
+ if (PreDOMIndep == 1'b1) begin : gen_pre_dom_indep
+ // This DOM-dep multiplier is directly followed by a DOM-indep multiplier without an additional
+ // pipeline stage in between. To prevent SCA leakage in the DOM-indep multiplier, the d_y and
+ // _D_y_z0 parts of d_b need to be individually multiplied with input x and then the results
+ // need to be integrated (summed up) on a per-domain basis.
+
+ // d_y part: Inner-domain terms of x * y
+ (* keep = "true" *) logic [NPower-1:0] mul_ax_ay_d, mul_ax_ay_q;
+ (* keep = "true" *) logic [NPower-1:0] mul_bx_by_d, mul_bx_by_q;
+ if (NPower == 4) begin : gen_inner_mul_gf2p4
+ assign mul_ax_ay_d = aes_mul_gf2p4(a_x, a_y);
+ assign mul_bx_by_d = aes_mul_gf2p4(b_x, b_y);
+
+ end else begin : gen_inner_mul_gf2p2
+ assign mul_ax_ay_d = aes_mul_gf2p2(a_x, a_y);
+ assign mul_bx_by_d = aes_mul_gf2p2(b_x, b_y);
+ end
+
+ // Registers
+ always_ff @(posedge clk_i or negedge rst_ni) begin
+ if (!rst_ni) begin
+ mul_ax_ay_q <= '0;
+ mul_bx_by_q <= '0;
+ end else begin
+ mul_ax_ay_q <= mul_ax_ay_d;
+ mul_bx_by_q <= mul_bx_by_d;
+ end
+ end
+
+ // Input Registers
+ (* keep = "true" *) logic [NPower-1:0] a_x_q;
+ (* keep = "true" *) logic [NPower-1:0] b_x_q;
+ always_ff @(posedge clk_i or negedge rst_ni) begin
+ if (!rst_ni) begin
+ a_x_q <= '0;
+ b_x_q <= '0;
+ end else begin
+ a_x_q <= a_x;
+ b_x_q <= b_x;
+ end
+ end
+
+ // _D_y_z0 part: Cross-domain terms: d_x * _D_y_z0
+ // Need to use registered version of input x.
+ (* keep = "true" *) logic [NPower-1:0] mul_ax_byz0, mul_bx_ayz0;
+ if (NPower == 4) begin : gen_cross_mul_gf2p4
+ assign mul_ax_byz0 = aes_mul_gf2p4(a_x_q, b_yz0_q);
+ assign mul_bx_ayz0 = aes_mul_gf2p4(b_x_q, a_yz0_q);
+
+ end else begin : gen_cross_mul_gf2p2
+ assign mul_ax_byz0 = aes_mul_gf2p2(a_x_q, b_yz0_q);
+ assign mul_bx_ayz0 = aes_mul_gf2p2(b_x_q, a_yz0_q);
+ end
+
+ // Integration
+ assign a_q = axz0_z1_q ^ mul_ax_ay_q ^ mul_ax_byz0;
+ assign b_q = bxz0_z1_q ^ mul_bx_by_q ^ mul_bx_ayz0;
+
+ end else begin : gen_not_pre_dom_indep
+ // This DOM-dep multiplier is not directly followed by a DOM-indep multiplier. As a result,
+ // the d_y and _D_y_z0 parts of d_b can be summed up prior to the multiplication with input
+ // x which helps saving 2 GF multipliers and 4 registers (NPower flops each).
+
+ // Sum up d_y and _D_y_z0.
+ (* keep = "true" *) logic [NPower-1:0] a_b, b_b;
+ assign a_b = a_y ^ b_yz0_q;
+ assign b_b = b_y ^ a_yz0_q;
+
+ // GF multiplications
+ (* keep = "true" *) logic [NPower-1:0] a_mul_ax_b, b_mul_bx_b;
+ if (NPower == 4) begin : gen_mul_gf2p4
+ assign a_mul_ax_b = aes_mul_gf2p4(a_x, a_b);
+ assign b_mul_bx_b = aes_mul_gf2p4(b_x, b_b);
+ end else begin : gen_mul_gf2p2
+ assign a_mul_ax_b = aes_mul_gf2p2(a_x, a_b);
+ assign b_mul_bx_b = aes_mul_gf2p2(b_x, b_b);
+ end
+
+ // Integration
+ assign a_q = axz0_z1_q ^ a_mul_ax_b;
+ assign b_q = bxz0_z1_q ^ b_mul_bx_b;
+ end
+
+ // Only GF(2^4) and GF(2^2) are supported.
+ `ASSERT_INIT(AesDomDepMulPower, NPower == 4 || NPower == 2)
+
+endmodule
+
+// Inverse in GF(2^4) using first-order domain-oriented masking and normal basis [z^4, z].
+ // See Fig. 6 in [1] (grey block, Stages 2 and 3) and Formulas 6, 13, 14, 15, 16, 17 in [2].
+module aes_dom_inverse_gf2p4 (
+ input logic clk_i,
+ input logic rst_ni,
+ input logic [3:0] a_gamma,
+ input logic [3:0] b_gamma,
+ input logic [11:0] prd,
+ output logic [3:0] a_gamma_inv,
+ output logic [3:0] b_gamma_inv
+);
+
+ import aes_sbox_canright_pkg::*;
+
+ // Distribute the randomness for the various multipliers.
+ logic [3:0] z_2;
+ logic [3:0] z_3_1;
+ logic [3:0] z_3_0;
+ assign z_2 = prd[3:0];
+ assign z_3_0 = prd[7:4];
+ assign z_3_1 = prd[11:8];
+
+ /////////////
+ // Stage 2 //
+ /////////////
+ // Formula 13 in [2].
+
+ logic [1:0] a_gamma1, a_gamma0, b_gamma1, b_gamma0, a_gamma1_gamma0, b_gamma1_gamma0;
+ (* keep = "true" *) logic [1:0] a_gamma_ss, b_gamma_ss;
+ assign a_gamma1 = a_gamma[3:2];
+ assign a_gamma0 = a_gamma[1:0];
+ assign b_gamma1 = b_gamma[3:2];
+ assign b_gamma0 = b_gamma[1:0];
+
+ assign a_gamma_ss = aes_scale_omega2_gf2p2(aes_square_gf2p2(a_gamma1 ^ a_gamma0));
+ assign b_gamma_ss = aes_scale_omega2_gf2p2(aes_square_gf2p2(b_gamma1 ^ b_gamma0));
+
+ aes_dom_dep_mul_gf2pn #(
+ .NPower ( 2 ),
+ .PreDOMIndep ( 1'b0 )
+ ) aes_dom_mul_gamma1_gamma0 (
+ .clk_i ( clk_i ),
+ .rst_ni ( rst_ni ),
+ .a_x ( a_gamma1 ), // Share a of x
+ .a_y ( a_gamma0 ), // Share a of y
+ .b_x ( b_gamma1 ), // Share b of x
+ .b_y ( b_gamma0 ), // Share b of y
+ .z_0 ( z_2[1:0] ), // Randomness for blinding
+ .z_1 ( z_2[3:2] ), // Randomness for resharing
+ .a_q ( a_gamma1_gamma0 ), // Share a of q
+ .b_q ( b_gamma1_gamma0 ) // Share b of q
+ );
+
+ /////////////
+ // Stage 3 //
+ /////////////
+
+ // Formulas 14 and 15 in [2].
+ (* keep = "true" *) logic [1:0] a_omega, b_omega;
+ assign a_omega = aes_square_gf2p2(a_gamma1_gamma0 ^ a_gamma_ss);
+ assign b_omega = aes_square_gf2p2(b_gamma1_gamma0 ^ b_gamma_ss);
+
+ // Formulas 16 and 17 in [2].
+
+ aes_dom_dep_mul_gf2pn #(
+ .NPower ( 2 ),
+ .PreDOMIndep ( 1'b1 )
+ ) aes_dom_mul_omega_gamma1 (
+ .clk_i ( clk_i ),
+ .rst_ni ( rst_ni ),
+ .a_x ( a_gamma1 ), // Share a of x
+ .a_y ( a_omega ), // Share a of y
+ .b_x ( b_gamma1 ), // Share b of x
+ .b_y ( b_omega ), // Share b of y
+ .z_0 ( z_3_1[1:0] ), // Randomness for blinding
+ .z_1 ( z_3_1[3:2] ), // Randomness for resharing
+ .a_q ( a_gamma_inv[1:0] ), // Share a of q
+ .b_q ( b_gamma_inv[1:0] ) // Share b of q
+ );
+
+ aes_dom_dep_mul_gf2pn #(
+ .NPower ( 2 ),
+ .PreDOMIndep ( 1'b1 )
+ ) aes_dom_mul_omega_gamma0 (
+ .clk_i ( clk_i ),
+ .rst_ni ( rst_ni ),
+ .a_x ( a_omega ), // Share a of x
+ .a_y ( a_gamma0 ), // Share a of y
+ .b_x ( b_omega ), // Share b of x
+ .b_y ( b_gamma0 ), // Share b of y
+ .z_0 ( z_3_0[1:0] ), // Randomness for blinding
+ .z_1 ( z_3_0[3:2] ), // Randomness for resharing
+ .a_q ( a_gamma_inv[3:2] ), // Share a of q
+ .b_q ( b_gamma_inv[3:2] ) // Share b of q
+ );
+
+endmodule
+
+// Inverse in GF(2^8) using first-order domain-oriented masking and normal basis [y^16, y].
+// See Fig. 6 in [1] and Formulas 3, 12, 18 and 19 in [2].
+module aes_dom_inverse_gf2p8 (
+ input logic clk_i,
+ input logic rst_ni,
+ input logic [7:0] a_y, // input data masked by b_y
+ input logic [7:0] b_y, // input mask
+ input logic [27:0] prd, // pseudo-random data, e.g. for intermediate masks
+ output logic [7:0] a_y_inv, // output data masked by b_y_inv
+ output logic [7:0] b_y_inv // output mask
+);
+
+ import aes_sbox_canright_pkg::*;
+
+ // Distribute the randomness for the various stages.
+ logic [7:0] z_1;
+ logic [11:0] z_23;
+ logic [3:0] z_4_0;
+ logic [3:0] z_4_1;
+ assign z_1 = prd[7:0];
+ assign z_23 = prd[19:8];
+ assign z_4_0 = prd[23:20];
+ assign z_4_1 = prd[27:24];
+
+ /////////////
+ // Stage 1 //
+ /////////////
+ // Formula 12 in [2].
+
+ logic [3:0] a_y1, a_y0, b_y1, b_y0, a_y1_y0, b_y1_y0;
+ (* keep = "true" *) logic [3:0] a_y_ss, b_y_ss, a_gamma, b_gamma;
+ assign a_y1 = a_y[7:4];
+ assign a_y0 = a_y[3:0];
+ assign b_y1 = b_y[7:4];
+ assign b_y0 = b_y[3:0];
+
+ assign a_y_ss = aes_square_scale_gf2p4_gf2p2(a_y1 ^ a_y0);
+ assign b_y_ss = aes_square_scale_gf2p4_gf2p2(b_y1 ^ b_y0);
+
+ aes_dom_dep_mul_gf2pn #(
+ .NPower ( 4 ),
+ .PreDOMIndep ( 1'b0 )
+ ) aes_dom_mul_y1_y0 (
+ .clk_i ( clk_i ),
+ .rst_ni ( rst_ni ),
+ .a_x ( a_y1 ), // Share a of x
+ .a_y ( a_y0 ), // Share a of y
+ .b_x ( b_y1 ), // Share b of x
+ .b_y ( b_y0 ), // Share b of y
+ .z_0 ( z_1[3:0] ), // Randomness for blinding
+ .z_1 ( z_1[7:4] ), // Randomness for resharing
+ .a_q ( a_y1_y0 ), // Share a of q
+ .b_q ( b_y1_y0 ) // Share b of q
+ );
+
+ assign a_gamma = a_y_ss ^ a_y1_y0;
+ assign b_gamma = b_y_ss ^ b_y1_y0;
+
+ ////////////////////
+ // Stages 2 and 3 //
+ ////////////////////
+
+ logic [3:0] a_theta, b_theta;
+
+ // a_gamma is masked by b_gamma, a_gamma_inv is masked by b_gamma_inv.
+ aes_dom_inverse_gf2p4 aes_dom_inverse_gf2p4 (
+ .clk_i ( clk_i ),
+ .rst_ni ( rst_ni ),
+ .a_gamma ( a_gamma ),
+ .b_gamma ( b_gamma ),
+ .prd ( z_23 ),
+ .a_gamma_inv ( a_theta ),
+ .b_gamma_inv ( b_theta )
+ );
+
+ /////////////
+ // Stage 4 //
+ /////////////
+ // Formulas 18 and 19 in [2].
+
+ aes_dom_indep_mul_gf2pn #(
+ .NPower ( 4 )
+ ) aes_dom_mul_theta_y1 (
+ .clk_i ( clk_i ),
+ .rst_ni ( rst_ni ),
+ .a_x ( a_y1 ), // Share a of x
+ .a_y ( a_theta ), // Share a of y
+ .b_x ( b_y1 ), // Share b of x
+ .b_y ( b_theta ), // Share b of y
+ .z_0 ( z_4_1 ), // Randomness for resharing
+ .a_q ( a_y_inv[3:0] ), // Share a of q
+ .b_q ( b_y_inv[3:0] ) // Share b of q
+ );
+
+ aes_dom_indep_mul_gf2pn #(
+ .NPower ( 4 )
+ ) aes_dom_mul_theta_y0 (
+ .clk_i ( clk_i ),
+ .rst_ni ( rst_ni ),
+ .a_x ( a_theta ), // Share a of x
+ .a_y ( a_y0 ), // Share a of y
+ .b_x ( b_theta ), // Share b of x
+ .b_y ( b_y0 ), // Share b of y
+ .z_0 ( z_4_0 ), // Randomness for resharing
+ .a_q ( a_y_inv[7:4] ), // Share a of q
+ .b_q ( b_y_inv[7:4] ) // Share b of q
+ );
+
+endmodule
+
+module aes_sbox_dom (
+ input logic clk_i,
+ input logic rst_ni,
+ input logic en_i,
+ output logic out_req_o,
+ input logic out_ack_i,
+ input aes_pkg::ciph_op_e op_i,
+ input logic [7:0] data_i, // masked, the actual input data is data_i ^ mask_i
+ input logic [7:0] mask_i, // input mask
+ input logic [27:0] prd_i, // pseudo-random data for remasking
+ output logic [7:0] data_o, // masked, the actual output data is data_o ^ mask_o
+ output logic [7:0] mask_o // output mask
+);
+
+ import aes_pkg::*;
+ import aes_sbox_canright_pkg::*;
+
+ logic [7:0] in_data_basis_x, out_data_basis_x;
+ logic [7:0] in_mask_basis_x, out_mask_basis_x;
+
+ // Convert data to normal basis X.
+ assign in_data_basis_x = (op_i == CIPH_FWD) ? aes_mvm(data_i, A2X) :
+ aes_mvm(data_i ^ 8'h63, S2X);
+
+ // Convert mask to normal basis X.
+ // The addition of constant 8'h63 prior to the affine transformation is skipped.
+ assign in_mask_basis_x = (op_i == CIPH_FWD) ? aes_mvm(mask_i, A2X) :
+ aes_mvm(mask_i, S2X);
+
+ // Do the inversion in normal basis X.
+ aes_dom_inverse_gf2p8 aes_dom_inverse_gf2p8 (
+ .clk_i ( clk_i ),
+ .rst_ni ( rst_ni ),
+ .a_y ( in_data_basis_x ), // input
+ .b_y ( in_mask_basis_x ), // input
+ .prd ( prd_i ), // input
+ .a_y_inv ( out_data_basis_x ), // output
+ .b_y_inv ( out_mask_basis_x ) // output
+ );
+
+ // Convert data to basis S or A.
+ assign data_o = (op_i == CIPH_FWD) ? (aes_mvm(out_data_basis_x, X2S) ^ 8'h63) :
+ (aes_mvm(out_data_basis_x, X2A));
+
+ // Convert mask to basis S or A.
+ // The addition of constant 8'h63 following the affine transformation is skipped.
+ assign mask_o = (op_i == CIPH_FWD) ? aes_mvm(out_mask_basis_x, X2S) :
+ aes_mvm(out_mask_basis_x, X2A);
+
+ // Counter register
+ logic [2:0] count_d, count_q;
+ assign count_d = (out_req_o && out_ack_i) ? '0 :
+ out_req_o ? count_q :
+ en_i ? count_q + 3'b001 : count_q;
+ always_ff @(posedge clk_i or negedge rst_ni) begin : reg_count
+ if (!rst_ni) begin
+ count_q <= '0;
+ end else begin
+ count_q <= count_d;
+ end
+ end
+ assign out_req_o = en_i & count_q == 3'b100;
+
+endmodule