// Source blob: 08a92c3a8af26eb37ea550dd058c5589aaaf090a (repository viewer header, not RTL)
// Copyright lowRISC contributors.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
//
// AES S-Box with First-Order Domain-Oriented Masking
//
// This is the unpipelined version using DOM-dep multipliers. It has a latency of 5 clock cycles
// and requires 28 bits of pseudo-random data per evaluation. Pipelining would only be beneficial
// when using
// - either a cipher core architecture with a data path smaller than 128 bit, i.e., where the
// individual S-Boxes are evaluated more than once per round, or
// - a fully unrolled cipher core architecture for maximum throughput.
//
// Note: The DOM AES S-Box is built on top of the Canright masked S-Box without mask re-use.
//
// For details, see the following papers and reports:
// [1] Gross, "Domain-Oriented Masking: Compact Masked Hardware Implementations with Arbitrary
// Protection Order" available at https://eprint.iacr.org/2016/486.pdf
// [2] Canright, "A very compact 'perfectly masked' S-box for AES (corrected)" available at
// https://eprint.iacr.org/2009/011.pdf
// [3] Canright, "A very compact Rijndael S-box" available at https://hdl.handle.net/10945/25608
//
// Using the Coco-Alma tool in transient mode, this implementation has been formally verified to be
// secure against first-order side-channel analysis (SCA). For more information on the tool,
// refer to the following papers:
// [4] Gigerl, "COCO: Co-design and co-verification of masked software implementations on CPUs"
// available at https://eprint.iacr.org/2020/1294.pdf
// [5] Bloem, "Formal verification of masked hardware implementations in the presence of glitches"
// available at https://eprint.iacr.org/2017/897.pdf
///////////////////////////////////////////////////////////////////////////////////////////////////
// IMPORTANT NOTE: //
// DO NOT USE THIS FOR SYNTHESIS BLINDLY! //
// //
// This implementation relies on primitive cells like prim_buf/flop_en containing tool-specific //
// synthesis attributes to prevent the synthesis tool from optimizing away/re-ordering registers //
// and to enforce the correct ordering of operations. Without the proper primitives, synthesis //
// tools might heavily optimize the design. The result is likely insecure. Use with care. //
///////////////////////////////////////////////////////////////////////////////////////////////////
`include "prim_assert.sv"
// Packed struct for pseudo-random data (PRD) input. Stages 1, 3 and 4 require 8 bits each. Stage 2
// requires just 4 bits, giving 8 + 4 + 8 + 8 = 28 bits in total.
//
// Bit usage as wired up in aes_dom_inverse_gf2p8 / aes_dom_inverse_gf2p4:
// - prd_1: [3:0] blinding (z_0) and [7:4] resharing (z_1) of the Stage 1 DOM-dep multiplier.
// - prd_2: [1:0] blinding (z_0) and [3:2] resharing (z_1) of the Stage 2 DOM-dep multiplier.
// - prd_3: [7:4] feed the omega * gamma1 multiplier, [3:0] the omega * gamma0 multiplier of
//          Stage 3 (lower 2 bits of each nibble for blinding, upper 2 bits for resharing).
// - prd_4: [7:4] and [3:0] are the resharing randomness of the two Stage 4 DOM-indep multipliers.
typedef struct packed {
logic [7:0] prd_1;
logic [3:0] prd_2;
logic [7:0] prd_3;
logic [7:0] prd_4;
} prd_in_t;
// Packed struct for pseudo-random data (PRD) output. Stages 2 and 3 produce 8 bits each. Stage 1
// produces just 4 bits, giving 4 + 8 + 8 = 20 bits in total.
//
// The values are taken from one share of registered, remasked intermediate results inside the
// S-Box and are meant to be used as PRD input by other S-Box instances:
// - prd_1: produced by Stage 1 (aes_dom_inverse_gf2p8).
// - prd_2: produced by Stage 2 (aes_dom_inverse_gf2p4).
// - prd_3: produced by Stage 3 (aes_dom_inverse_gf2p4).
typedef struct packed {
logic [3:0] prd_1;
logic [7:0] prd_2;
logic [7:0] prd_3;
} prd_out_t;
// DOM-indep GF(2^N) multiplier, first-order masked.
// Computes (a_q ^ b_q) = (a_x ^ b_x) * (a_y ^ b_y), i.e. q = x * y using first-order
// domain-oriented masking. The sharings of x and y are required to be uniformly random and
// independent from each other.
// See Fig. 2 in [1].
//
// Parameters:
// - NPower:   Field selection, 4 for GF(2^4) and 2 for GF(2^2). Any value other than 4 elaborates
//             the GF(2^2) data path; values other than 2 and 4 are rejected by the ASSERT_INIT at
//             the end of the module.
// - Pipeline: 1'b1 registers the inner-domain terms together with the reshared cross-domain
//             terms. 1'b0 only passes the inner-domain terms through a prim_buf.
//
// Timing: The reshared cross-domain terms are always registered (enabled by we_i). The outputs
// a_q/b_q therefore combine register contents captured while we_i was asserted with inner-domain
// terms that are either registered as well (Pipeline = 1) or purely combinational (Pipeline = 0).
module aes_dom_indep_mul_gf2pn #(
parameter int unsigned NPower = 4,
parameter bit Pipeline = 1'b0
) (
// Clock and reset, forwarded to the registers below.
input logic clk_i,
input logic rst_ni,
// Enable for all registers inside this multiplier.
input logic we_i,
input logic [NPower-1:0] a_x, // Share a of x
input logic [NPower-1:0] a_y, // Share a of y
input logic [NPower-1:0] b_x, // Share b of x
input logic [NPower-1:0] b_y, // Share b of y
input logic [NPower-1:0] z_0, // Randomness for resharing
output logic [NPower-1:0] a_q, // Share a of q
output logic [NPower-1:0] b_q // Share b of q
);
// Provides the aes_mul_gf2p4() / aes_mul_gf2p2() functions used below.
import aes_sbox_canright_pkg::*;
/////////////////
// Calculation //
/////////////////
// Inner-domain terms: Products of shares belonging to the same domain (a with a, b with b).
logic [NPower-1:0] mul_ax_ay_d, mul_bx_by_d;
if (NPower == 4) begin : gen_inner_mul_gf2p4
assign mul_ax_ay_d = aes_mul_gf2p4(a_x, a_y);
assign mul_bx_by_d = aes_mul_gf2p4(b_x, b_y);
end else begin : gen_inner_mul_gf2p2
assign mul_ax_ay_d = aes_mul_gf2p2(a_x, a_y);
assign mul_bx_by_d = aes_mul_gf2p2(b_x, b_y);
end
// Cross-domain terms: Products mixing shares of both domains. These must be reshared with fresh
// randomness and registered before they are integrated into either output share.
logic [NPower-1:0] mul_ax_by, mul_ay_bx;
if (NPower == 4) begin : gen_cross_mul_gf2p4
assign mul_ax_by = aes_mul_gf2p4(a_x, b_y);
assign mul_ay_bx = aes_mul_gf2p4(a_y, b_x);
end else begin : gen_cross_mul_gf2p2
assign mul_ax_by = aes_mul_gf2p2(a_x, b_y);
assign mul_ay_bx = aes_mul_gf2p2(a_y, b_x);
end
///////////////
// Resharing //
///////////////
// Resharing of cross-domain terms. Both terms are XORed with the same z_0, so z_0 cancels out
// when the two output shares are combined.
logic [NPower-1:0] aq_z0_d, bq_z0_d;
logic [NPower-1:0] aq_z0_q, bq_z0_q;
assign aq_z0_d = z_0 ^ mul_ax_by;
assign bq_z0_d = z_0 ^ mul_ay_bx;
// Registers holding the reshared cross-domain terms, updated while we_i is asserted.
prim_flop_en #(
.Width ( 2*NPower ),
.ResetValue ( '0 )
) u_prim_flop_abq_z0 (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.en_i ( we_i ),
.d_i ( {aq_z0_d, bq_z0_d} ),
.q_o ( {aq_z0_q, bq_z0_q} )
);
/////////////////////////
// Optional Pipelining //
/////////////////////////
// Inner-domain terms as used for the integration, either registered or buffered.
logic [NPower-1:0] mul_ax_ay, mul_bx_by;
if (Pipeline == 1'b1) begin : gen_pipeline
// Add pipeline registers on inner-domain terms prior to integration. This allows accepting new
// input data every clock cycle and prevents SCA leakage occurring due to the integration of
// reshared cross-domain terms with inner-domain terms derived from different input data.
logic [NPower-1:0] mul_ax_ay_q, mul_bx_by_q;
prim_flop_en #(
.Width ( 2*NPower ),
.ResetValue ( '0 )
) u_prim_flop_mul_abx_aby (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.en_i ( we_i ),
.d_i ( {mul_ax_ay_d, mul_bx_by_d} ),
.q_o ( {mul_ax_ay_q, mul_bx_by_q} )
);
assign mul_ax_ay = mul_ax_ay_q;
assign mul_bx_by = mul_bx_by_q;
end else begin : gen_no_pipeline
// Do not add the optional pipeline registers on the inner-domain terms. This allows to save
// some area in case the multiplier does not need to accept new data in every cycle. However,
// this can cause SCA leakage as during the clock cycle in which new data arrives, the new
// inner-domain terms are integrated with the previous, reshared cross-domain terms.
// Avoid aggressive synthesis optimizations.
logic [NPower-1:0] mul_ax_ay_buf, mul_bx_by_buf;
prim_buf #(
.Width ( 2*NPower )
) u_prim_buf_mul_abx_aby (
.in_i ( {mul_ax_ay_d, mul_bx_by_d} ),
.out_o ( {mul_ax_ay_buf, mul_bx_by_buf} )
);
assign mul_ax_ay = mul_ax_ay_buf;
assign mul_bx_by = mul_bx_by_buf;
end
/////////////////
// Integration //
/////////////////
// Each output share is the inner-domain term of its own domain plus one reshared, registered
// cross-domain term.
assign a_q = mul_ax_ay ^ aq_z0_q;
assign b_q = mul_bx_by ^ bq_z0_q;
// Only GF(2^4) and GF(2^2) are supported.
`ASSERT_INIT(AesDomIndepMulPower, NPower == 4 || NPower == 2)
endmodule
// DOM-dep GF(2^N) multiplier, first-order masked.
// Computes (a_q ^ b_q) = (a_x ^ b_x) * (a_y ^ b_y), i.e. q = x * y using first-order
// domain-oriented masking. The sharings of x and y are NOT required to be independent from each
// other. This is the un-optimized version consuming 3 times N bits of randomness for blinding and
// resharing. It is not used in the design but we keep it for reference.
// See Fig. 4 and Formulas 8 - 11 in [1].
//
// Principle: With z = a_z ^ b_z, the product is computed as x * y = x * (y ^ z) ^ x * z where
// - b = y ^ z is the blinded, registered and then unmasked version of y, and
// - x * z is computed by a DOM-indep multiplier (the sharings of x and z are independent).
//
// Parameters:
// - NPower:   Field selection, 4 for GF(2^4) and 2 for GF(2^2). Other values are rejected by the
//             ASSERT_INIT at the end of the module.
// - Pipeline: 1'b1 adds pipeline registers for input x and enables pipelining in the embedded
//             DOM-indep multiplier, 1'b0 omits them.
module aes_dom_dep_mul_gf2pn_unopt #(
parameter int unsigned NPower = 4,
parameter bit Pipeline = 1'b0
) (
// Clock and reset, forwarded to the registers below.
input logic clk_i,
input logic rst_ni,
// Enable for all registers inside this multiplier, including the embedded DOM-indep multiplier.
input logic we_i,
input logic [NPower-1:0] a_x, // Share a of x
input logic [NPower-1:0] a_y, // Share a of y
input logic [NPower-1:0] b_x, // Share b of x
input logic [NPower-1:0] b_y, // Share b of y
input logic [NPower-1:0] a_z, // Randomness for blinding
input logic [NPower-1:0] b_z, // Randomness for blinding
input logic [NPower-1:0] z_0, // Randomness for resharing
output logic [NPower-1:0] a_q, // Share a of q
output logic [NPower-1:0] b_q // Share b of q
);
// Provides the aes_mul_gf2p4() / aes_mul_gf2p2() functions used below.
import aes_sbox_canright_pkg::*;
//////////////
// Blinding //
//////////////
// Blinding of y by z. Each share of y is XORed with the corresponding share of z.
logic [NPower-1:0] a_yz_d, b_yz_d;
logic [NPower-1:0] a_yz_q, b_yz_q;
assign a_yz_d = a_y ^ a_z;
assign b_yz_d = b_y ^ b_z;
// Registers holding the blinded shares of y. The shares are only combined after this register
// stage.
prim_flop_en #(
.Width ( 2*NPower ),
.ResetValue ( '0 )
) u_prim_flop_ab_yz (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.en_i ( we_i ),
.d_i ( {a_yz_d, b_yz_d} ),
.q_o ( {a_yz_q, b_yz_q} )
);
////////////////
// Correction //
////////////////
// Compute the sharing of x * z which is needed to remove the blinding from the final result.
logic [NPower-1:0] a_mul_x_z, b_mul_x_z;
aes_dom_indep_mul_gf2pn #(
.NPower ( NPower ),
.Pipeline ( Pipeline )
) u_aes_dom_indep_mul_gf2pn (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.we_i ( we_i ),
.a_x ( a_x ), // Share a of x
.a_y ( a_z ), // Share a of z
.b_x ( b_x ), // Share b of x
.b_y ( b_z ), // Share b of z
.z_0 ( z_0 ), // Randomness for resharing
.a_q ( a_mul_x_z ), // Share a of x * z
.b_q ( b_mul_x_z ) // Share b of x * z
);
/////////////////////////
// Optional Pipelining //
/////////////////////////
// Shares of x as used for the multiplication with b, either registered or taken directly from
// the inputs.
logic [NPower-1:0] a_x_calc, b_x_calc;
if (Pipeline == 1'b1) begin : gen_pipeline
// Add pipeline registers for input x. This allows accepting new input data every clock cycle
// and prevents SCA leakage occurring due to the multiplication of input x with b belonging to
// different clock cycles.
logic [NPower-1:0] a_x_q, b_x_q;
prim_flop_en #(
.Width ( 2*NPower ),
.ResetValue ( '0 )
) u_prim_flop_ab_x (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.en_i ( we_i ),
.d_i ( {a_x, b_x} ),
.q_o ( {a_x_q, b_x_q} )
);
assign a_x_calc = a_x_q;
assign b_x_calc = b_x_q;
end else begin : gen_no_pipeline
// Do not add the optional pipeline registers for input x. This allows to save some area in
// case the multiplier does not need to accept new data in every cycle. However, this can cause
// SCA leakage as during the clock cycle in which new data arrives, the new x input is
// multiplied with the previous b.
assign a_x_calc = a_x;
assign b_x_calc = b_x;
end
/////////////////
// Calculation //
/////////////////
// Combine shares of blinded y to obtain b. This equals y ^ z, i.e., y stays blinded by z.
logic [NPower-1:0] b;
assign b = a_yz_q ^ b_yz_q;
// Multiply each share of x with the blinded value b.
logic [NPower-1:0] a_mul_ax_b, b_mul_bx_b;
if (NPower == 4) begin : gen_mul_gf2p4
assign a_mul_ax_b = aes_mul_gf2p4(a_x_calc, b);
assign b_mul_bx_b = aes_mul_gf2p4(b_x_calc, b);
end else begin : gen_mul_gf2p2
assign a_mul_ax_b = aes_mul_gf2p2(a_x_calc, b);
assign b_mul_bx_b = aes_mul_gf2p2(b_x_calc, b);
end
/////////////////
// Integration //
/////////////////
// Per domain, add the share of x * z to the share of x * (y ^ z). Combining both output shares
// yields x * z ^ x * (y ^ z) = x * y.
assign a_q = a_mul_x_z ^ a_mul_ax_b;
assign b_q = b_mul_x_z ^ b_mul_bx_b;
// Only GF(2^4) and GF(2^2) are supported.
`ASSERT_INIT(AesDomDepMulUnoptPower, NPower == 4 || NPower == 2)
endmodule
// DOM-dep GF(2^N) multiplier, first-order masked.
// Computes (a_q ^ b_q) = (a_x ^ b_x) * (a_y ^ b_y), i.e. q = x * y using first-order
// domain-oriented masking. The sharings of x and y are NOT required to be independent from each
// other. This is the optimized version consuming 2 instead of 3 times N bits of randomness for
// blinding and resharing.
// See Formula 12 in [1].
//
// Principle: Both shares of y are blinded with the same z_0 and registered. Each domain d then
// multiplies its share of x with d_b = d_y ^ (other share of y ^ z_0). The surplus x * z_0 term is
// removed again by the correction terms (d_x * z_0, reshared with z_1 and registered).
//
// Parameters:
// - NPower:      Field selection, 4 for GF(2^4) and 2 for GF(2^2). Other values are rejected by
//                the ASSERT_INIT at the end of the module.
// - Pipeline:    1'b1 selects the externally registered inputs *_q for the calculation (only if
//                PreDomIndep is not set, see the generate condition below), 1'b0 uses the
//                unregistered inputs.
// - PreDomIndep: See the parameter declaration below.
//
// The pipeline registers for the inputs x and y are not part of this module. If needed, the
// parent has to instantiate them and connect their outputs to a_x_q, a_y_q, b_x_q and b_y_q.
module aes_dom_dep_mul_gf2pn #(
parameter int unsigned NPower = 4,
parameter bit Pipeline = 1'b0,
parameter bit PreDomIndep = 1'b0 // 1'b0: Not followed by an un-pipelined DOM-indep
// multiplier, this enables additional area
// optimizations
// 1'b1: Directly followed by an un-pipelined
// DOM-indep multiplier, this is the version
// discussed in [1].
) (
// Clock and reset, forwarded to the registers below.
input logic clk_i,
input logic rst_ni,
// Enable for all registers inside this multiplier.
input logic we_i,
input logic [NPower-1:0] a_x, // Share a of x
input logic [NPower-1:0] a_y, // Share a of y
input logic [NPower-1:0] b_x, // Share b of x
input logic [NPower-1:0] b_y, // Share b of y
input logic [NPower-1:0] a_x_q, // Share a of x, pipelined (for Pipeline=1 or PreDomIndep=1)
input logic [NPower-1:0] a_y_q, // Share a of y, pipelined (for Pipeline=1)
input logic [NPower-1:0] b_x_q, // Share b of x, pipelined (for Pipeline=1 or PreDomIndep=1)
input logic [NPower-1:0] b_y_q, // Share b of y, pipelined (for Pipeline=1)
input logic [NPower-1:0] z_0, // Randomness for blinding
input logic [NPower-1:0] z_1, // Randomness for resharing
output logic [NPower-1:0] a_q, // Share a of q
output logic [NPower-1:0] b_q, // Share b of q
output logic [2*NPower-1:0] prd_o // Randomness for use in another S-Box instance
);
// Provides the aes_mul_gf2p4() / aes_mul_gf2p2() functions used below.
import aes_sbox_canright_pkg::*;
//////////////
// Blinding //
//////////////
// Blinding of y by z_0. Both shares are blinded with the same z_0.
logic [NPower-1:0] a_yz0_d, b_yz0_d;
logic [NPower-1:0] a_yz0_q, b_yz0_q;
assign a_yz0_d = a_y ^ z_0;
assign b_yz0_d = b_y ^ z_0;
// Registers holding the blinded shares of y.
prim_flop_en #(
.Width ( 2*NPower ),
.ResetValue ( '0 )
) u_prim_flop_ab_yz0 (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.en_i ( we_i ),
.d_i ( {a_yz0_d, b_yz0_d} ),
.q_o ( {a_yz0_q, b_yz0_q} )
);
////////////////
// Correction //
////////////////
// Basically, this is a DOM-indep multiplier with:
// - a_x = a_x, b_x = b_x, and
// - a_y = z_0, b_y = 0 (constant),
// which allows for further optimizations.
// Calculation
logic [NPower-1:0] mul_ax_z0, mul_bx_z0;
if (NPower == 4) begin : gen_corr_mul_gf2p4
assign mul_ax_z0 = aes_mul_gf2p4(a_x, z_0);
assign mul_bx_z0 = aes_mul_gf2p4(b_x, z_0);
end else begin : gen_corr_mul_gf2p2
assign mul_ax_z0 = aes_mul_gf2p2(a_x, z_0);
assign mul_bx_z0 = aes_mul_gf2p2(b_x, z_0);
end
// Avoid aggressive synthesis optimizations.
logic [NPower-1:0] mul_ax_z0_buf, mul_bx_z0_buf;
prim_buf #(
.Width ( 2*NPower )
) u_prim_buf_mul_abx_z0 (
.in_i ( {mul_ax_z0, mul_bx_z0} ),
.out_o ( {mul_ax_z0_buf, mul_bx_z0_buf} )
);
// Resharing. Both correction terms are XORed with the same z_1, so z_1 cancels out when the two
// output shares are combined.
logic [NPower-1:0] axz0_z1_d, bxz0_z1_d;
logic [NPower-1:0] axz0_z1_q, bxz0_z1_q;
assign axz0_z1_d = mul_ax_z0_buf ^ z_1;
assign bxz0_z1_d = mul_bx_z0_buf ^ z_1;
// Registers holding the reshared correction terms.
prim_flop_en #(
.Width ( 2*NPower ),
.ResetValue ( '0 )
) u_prim_flop_abxz0_z1 (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.en_i ( we_i ),
.d_i ( {axz0_z1_d, bxz0_z1_d} ),
.q_o ( {axz0_z1_q, bxz0_z1_q} )
);
// Use intermediate results for generating PRD for another S-Box instance.
// Use one share only. Directly use output of flops updating with we_i.
// These intermediate results are obtained by remasking b_y and mul_bx_z0 with z_0 and z_1,
// respectively. Since z_0/1 are uniformly distributed and independent of b_y and mul_bx_z0,
// the intermediate results are also uniformly distributed and independent of b_y and mul_bx_z0.
// For details, see Lemma 1 in [2].
// Layout: The upper NPower bits hold b_yz0_q, the lower NPower bits hold bxz0_z1_q.
assign prd_o = {b_yz0_q, bxz0_z1_q};
/////////////////////////
// Optional Pipelining //
/////////////////////////
// Shares of x and y as used for the calculation below.
// Note on the condition: With PreDomIndep == 1'b1, the first branch is never taken regardless of
// Pipeline, i.e., a_x_calc/b_x_calc/a_y_calc/b_y_calc are then always the unregistered inputs.
logic [NPower-1:0] a_x_calc, b_x_calc, a_y_calc, b_y_calc;
if (Pipeline == 1'b1 && PreDomIndep != 1'b1) begin : gen_pipeline_use
// Use pipelined inputs x and y. This allows accepting new input data every clock cycle and
// prevents SCA leakage occurring due to the multiplication of inputs x and y with d_b
// belonging to different clock cycles.
//
// The PreDomIndep variant uses the pipelined inputs directly.
assign a_x_calc = a_x_q;
assign b_x_calc = b_x_q;
assign a_y_calc = a_y_q;
assign b_y_calc = b_y_q;
end else begin : gen_no_pipeline_use
// Do not use pipelined inputs x and y. This allows to save some area in case the multiplier
// does not need to accept new data in every cycle. However, this can cause SCA leakage as
// during the clock cycle in which new data arrives, the new x and y inputs are multiplied
// with the previous d_b.
assign a_x_calc = a_x;
assign b_x_calc = b_x;
assign a_y_calc = a_y;
assign b_y_calc = b_y;
// Tie off unused signals.
// The pipelined shares of x are only unused if PreDomIndep is not set. Otherwise, they are
// consumed by the cross-domain multiplications in gen_pre_dom_indep below.
if (PreDomIndep != 1'b1) begin : gen_ab_x_q
logic [NPower-1:0] unused_a_x_q, unused_b_x_q;
assign unused_a_x_q = a_x_q;
assign unused_b_x_q = b_x_q;
end
// The pipelined shares of y are never used in this branch.
logic [NPower-1:0] unused_a_y_q, unused_b_y_q;
assign unused_a_y_q = a_y_q;
assign unused_b_y_q = b_y_q;
end
///////////////////////////////
// Calculation & Integration //
///////////////////////////////
// Compute b. Note that unlike for the unoptimized implementation, we don't combine the blinded
// shares of y to obtain a single b value. Instead, every domain d gets its own version of b:
//
// d_b = d_y ^ _D_y_z0
//
// where _D_y_z0 corresponds to the sum of all domains of y except for domain d, each
// individually blinded by z0 (needs to happen before the register bank). This optimization
// is only suitable for first-order masking.
// See Formula 12 in [1].
if (PreDomIndep == 1'b1) begin : gen_pre_dom_indep
// This DOM-dep multiplier is directly followed by an un-pipelined DOM-indep multiplier. To
// prevent SCA leakage in the un-pipelined DOM-indep multiplier, the d_y and _D_y_z0 parts of
// d_b need to be individually multiplied with input x and then the results need to be
// integrated (summed up) on a per-domain basis.
// d_y part: Inner-domain terms of x * y
logic [NPower-1:0] mul_ax_ay_d, mul_bx_by_d;
logic [NPower-1:0] mul_ax_ay_q, mul_bx_by_q;
if (NPower == 4) begin : gen_inner_mul_gf2p4
assign mul_ax_ay_d = aes_mul_gf2p4(a_x_calc, a_y_calc);
assign mul_bx_by_d = aes_mul_gf2p4(b_x_calc, b_y_calc);
end else begin : gen_inner_mul_gf2p2
assign mul_ax_ay_d = aes_mul_gf2p2(a_x_calc, a_y_calc);
assign mul_bx_by_d = aes_mul_gf2p2(b_x_calc, b_y_calc);
end
// Registers holding the inner-domain terms.
prim_flop_en #(
.Width ( 2*NPower ),
.ResetValue ( '0 )
) u_prim_flop_mul_abx_aby (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.en_i ( we_i ),
.d_i ( {mul_ax_ay_d, mul_bx_by_d} ),
.q_o ( {mul_ax_ay_q, mul_bx_by_q} )
);
// _D_y_z0 part: Cross-domain terms: d_x * _D_y_z0
// Need to use registered version of input x.
logic [NPower-1:0] mul_ax_byz0, mul_bx_ayz0;
if (NPower == 4) begin : gen_cross_mul_gf2p4
assign mul_ax_byz0 = aes_mul_gf2p4(a_x_q, b_yz0_q);
assign mul_bx_ayz0 = aes_mul_gf2p4(b_x_q, a_yz0_q);
end else begin : gen_cross_mul_gf2p2
assign mul_ax_byz0 = aes_mul_gf2p2(a_x_q, b_yz0_q);
assign mul_bx_ayz0 = aes_mul_gf2p2(b_x_q, a_yz0_q);
end
// Avoid aggressive synthesis optimizations.
logic [NPower-1:0] mul_ax_byz0_buf, mul_bx_ayz0_buf;
prim_buf #(
.Width ( 2*NPower )
) u_prim_buf_mul_abx_bayz0 (
.in_i ( {mul_ax_byz0, mul_bx_ayz0} ),
.out_o ( {mul_ax_byz0_buf, mul_bx_ayz0_buf} )
);
// Integration: Per domain, sum up the reshared correction term, the registered inner-domain
// term and the buffered cross-domain term.
assign a_q = axz0_z1_q ^ mul_ax_ay_q ^ mul_ax_byz0_buf;
assign b_q = bxz0_z1_q ^ mul_bx_by_q ^ mul_bx_ayz0_buf;
end else begin : gen_not_pre_dom_indep
// This DOM-dep multiplier is not directly followed by an un-pipelined DOM-indep multiplier. As
// a result, the d_y and _D_y_z0 parts of d_b can be summed up prior to the multiplication
// with input x which allows saving 2 GF multipliers.
// Sum up d_y and _D_y_z0.
logic [NPower-1:0] a_b, b_b;
assign a_b = a_y_calc ^ b_yz0_q;
assign b_b = b_y_calc ^ a_yz0_q;
// Avoid aggressive synthesis optimizations.
logic [NPower-1:0] a_b_buf, b_b_buf;
prim_buf #(
.Width ( 2*NPower )
) u_prim_buf_ab_b (
.in_i ( {a_b, b_b} ),
.out_o ( {a_b_buf, b_b_buf} )
);
// GF multiplications
logic [NPower-1:0] a_mul_ax_b, b_mul_bx_b;
if (NPower == 4) begin : gen_mul_gf2p4
assign a_mul_ax_b = aes_mul_gf2p4(a_x_calc, a_b_buf);
assign b_mul_bx_b = aes_mul_gf2p4(b_x_calc, b_b_buf);
end else begin : gen_mul_gf2p2
assign a_mul_ax_b = aes_mul_gf2p2(a_x_calc, a_b_buf);
assign b_mul_bx_b = aes_mul_gf2p2(b_x_calc, b_b_buf);
end
// Avoid aggressive synthesis optimizations.
logic [NPower-1:0] a_mul_ax_b_buf, b_mul_bx_b_buf;
prim_buf #(
.Width ( 2*NPower )
) u_prim_buf_ab_mul_abx_b (
.in_i ( {a_mul_ax_b, b_mul_bx_b} ),
.out_o ( {a_mul_ax_b_buf, b_mul_bx_b_buf} )
);
// Integration: Per domain, add the reshared correction term to the buffered product.
assign a_q = axz0_z1_q ^ a_mul_ax_b_buf;
assign b_q = bxz0_z1_q ^ b_mul_bx_b_buf;
end
// Only GF(2^4) and GF(2^2) are supported.
`ASSERT_INIT(AesDomDepMulPower, NPower == 4 || NPower == 2)
endmodule
// Inverse in GF(2^4) using first-order domain-oriented masking and normal basis [z^4, z].
// See Fig. 6 in [2] (grey block, Stages 2 and 3) and Formulas 6, 13, 14, 15, 16, 17 in [2].
//
// Parameters:
// - PipelineMul: Forwarded as Pipeline parameter to the three DOM-dep multipliers. If set, the
//                additional input pipeline registers for the Stage 3 multipliers are
//                instantiated in this module.
//
// Ports:
// - we_i[0]:        Enable for the Stage 2 registers.
// - we_i[1]:        Enable for the Stage 3 registers.
// - a/b_gamma:      The two shares of the input gamma.
// - prd_2_i:        PRD for Stage 2, [1:0] for blinding and [3:2] for resharing.
// - prd_3_i:        PRD for Stage 3, [7:4] for the omega * gamma1 multiplier and [3:0] for the
//                   omega * gamma0 multiplier.
// - a/b_gamma_inv:  The two shares of the result.
// - prd_2_o/3_o:    Remasked intermediate results for use as PRD in other S-Box instances.
module aes_dom_inverse_gf2p4 #(
parameter bit PipelineMul = 1'b1
) (
input logic clk_i,
input logic rst_ni,
input logic [1:0] we_i,
input logic [3:0] a_gamma,
input logic [3:0] b_gamma,
input logic [3:0] prd_2_i,
input logic [7:0] prd_3_i,
output logic [3:0] a_gamma_inv,
output logic [3:0] b_gamma_inv,
output logic [7:0] prd_2_o,
output logic [7:0] prd_3_o
);
// Provides the GF(2^2) helper functions used below.
import aes_sbox_canright_pkg::*;
/////////////
// Stage 2 //
/////////////
// Formula 13 in [2].
// Split both shares of gamma into their upper (gamma1) and lower (gamma0) halves.
logic [1:0] a_gamma1, a_gamma0, b_gamma1, b_gamma0, a_gamma1_gamma0, b_gamma1_gamma0;
assign a_gamma1 = a_gamma[3:2];
assign a_gamma0 = a_gamma[1:0];
assign b_gamma1 = b_gamma[3:2];
assign b_gamma0 = b_gamma[1:0];
// Square-scale term, computed independently per share and then registered.
logic [1:0] a_gamma_ss_d, b_gamma_ss_d;
logic [1:0] a_gamma_ss_q, b_gamma_ss_q;
assign a_gamma_ss_d = aes_scale_omega2_gf2p2(aes_square_gf2p2(a_gamma1 ^ a_gamma0));
assign b_gamma_ss_d = aes_scale_omega2_gf2p2(aes_square_gf2p2(b_gamma1 ^ b_gamma0));
prim_flop_en #(
.Width ( 4 ),
.ResetValue ( '0 )
) u_prim_flop_ab_gamma_ss (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.en_i ( we_i[0] ),
.d_i ( {a_gamma_ss_d, b_gamma_ss_d} ),
.q_o ( {a_gamma_ss_q, b_gamma_ss_q} )
);
// Registered halves of gamma. These are instantiated unconditionally as they serve as inputs to
// the Stage 3 multipliers and to prd_2_o, independent of PipelineMul.
logic [1:0] a_gamma1_q, a_gamma0_q, b_gamma1_q, b_gamma0_q;
prim_flop_en #(
.Width ( 8 ),
.ResetValue ( '0 )
) u_prim_flop_ab_gamma10 (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.en_i ( we_i[0] ),
.d_i ( {a_gamma1, a_gamma0, b_gamma1, b_gamma0} ),
.q_o ( {a_gamma1_q, a_gamma0_q, b_gamma1_q, b_gamma0_q} )
);
// Masked multiplication gamma1 * gamma0. The two operands are halves of the same sharing, thus
// a DOM-dep multiplier is used.
logic [3:0] b_gamma10_prd2;
aes_dom_dep_mul_gf2pn #(
.NPower ( 2 ),
.Pipeline ( PipelineMul ),
.PreDomIndep ( 1'b0 )
) u_aes_dom_mul_gamma1_gamma0 (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.we_i ( we_i[0] ),
.a_x ( a_gamma1 ), // Share a of x
.a_y ( a_gamma0 ), // Share a of y
.b_x ( b_gamma1 ), // Share b of x
.b_y ( b_gamma0 ), // Share b of y
.a_x_q ( a_gamma1_q ), // Share a of x, pipelined (for Pipeline=1 or PreDomIndep=1)
.a_y_q ( a_gamma0_q ), // Share a of y, pipelined (for Pipeline=1)
.b_x_q ( b_gamma1_q ), // Share b of x, pipelined (for Pipeline=1 or PreDomIndep=1)
.b_y_q ( b_gamma0_q ), // Share b of y, pipelined (for Pipeline=1)
.z_0 ( prd_2_i[1:0] ), // Randomness for blinding
.z_1 ( prd_2_i[3:2] ), // Randomness for resharing
.a_q ( a_gamma1_gamma0 ), // Share a of q
.b_q ( b_gamma1_gamma0 ), // Share b of q
.prd_o ( b_gamma10_prd2 ) // Randomness for use in another S-Box instance
);
// Use intermediate results for generating PRD for Stage 3 of another S-Box instance.
// Use one share only. Directly use output of flops updating with we_i[0].
// b_gamma10_prd2 is based on b_gamma1_q, b_gamma0_q but XORed with prd_2_i, thus uniformly
// distributed and independent of b_gamma1/0_q (See Lemma 1 in [2]).
//
// In Stage 3 of another S-Box instance, the MSBs and LSBs of the term below are used:
// 1. as randomness in the DOM-dep multipliers u_aes_dom_mul_omega_gamma1/0, and
// 2. to generate randomness for the DOM-indep multipliers u_aes_dom_mul_theta_y1/0 in Stage 4 of
// yet another S-Box instance, respectively.
// Without interleaving b_gamma1/0_q as well as the upper and lower halves of b_gamma10_prd2 here,
// a glitch on the write-enable signal on the input pipeline register of these DOM-indep
// multipliers may result in undesirable SCA leakage.
assign prd_2_o = {b_gamma1_q, b_gamma10_prd2[3:2], b_gamma0_q, b_gamma10_prd2[1:0]};
/////////////
// Stage 3 //
/////////////
// Formulas 14 and 15 in [2].
// Per share, add the registered square-scale term to the product and square the sum.
logic [1:0] a_omega, b_omega;
assign a_omega = aes_square_gf2p2(a_gamma1_gamma0 ^ a_gamma_ss_q);
assign b_omega = aes_square_gf2p2(b_gamma1_gamma0 ^ b_gamma_ss_q);
// Avoid aggressive synthesis optimizations.
logic [1:0] a_omega_buf, b_omega_buf;
prim_buf #(
.Width ( 4 )
) u_prim_buf_ab_omega (
.in_i ( {a_omega, b_omega} ),
.out_o ( {a_omega_buf, b_omega_buf} )
);
// Pipeline registers
logic [1:0] a_gamma1_qq, a_gamma0_qq, b_gamma1_qq, b_gamma0_qq, a_omega_buf_q, b_omega_buf_q;
if (PipelineMul == 1'b1) begin: gen_prim_flop_omega_gamma10
// We instantiate the input pipeline registers for the DOM-dep multiplier outside of the
// multiplier to enable sharing of pipeline registers where applicable.
prim_flop_en #(
.Width ( 8 ),
.ResetValue ( '0 )
) u_prim_flop_ab_gamma10_q (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.en_i ( we_i[1] ),
.d_i ( {a_gamma1_q, a_gamma0_q, b_gamma1_q, b_gamma0_q} ),
.q_o ( {a_gamma1_qq, a_gamma0_qq, b_gamma1_qq, b_gamma0_qq} )
);
// These inputs are used by both DOM-dep multipliers below.
prim_flop_en #(
.Width ( 4 ),
.ResetValue ( '0 )
) u_prim_flop_ab_omega_buf (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.en_i ( we_i[1] ),
.d_i ( {a_omega_buf, b_omega_buf} ),
.q_o ( {a_omega_buf_q, b_omega_buf_q} )
);
end else begin : gen_no_prim_flop_ab_y10
// When using un-pipelined multipliers, there is no need to insert additional registers.
// We drive the corresponding inputs to 0 to make sure the functionality isn't correct in case
// the pipelined inputs are erroneously used.
assign a_gamma1_qq = '0;
assign a_gamma0_qq = '0;
assign b_gamma1_qq = '0;
assign b_gamma0_qq = '0;
assign a_omega_buf_q = '0;
assign b_omega_buf_q = '0;
end
// Formulas 16 and 17 in [2].
// Masked multiplication gamma1 * omega, producing the lower half of the result.
logic [3:0] b_gamma1_omega_prd3;
aes_dom_dep_mul_gf2pn #(
.NPower ( 2 ),
.Pipeline ( PipelineMul ),
.PreDomIndep ( 1'b0 )
) u_aes_dom_mul_omega_gamma1 (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.we_i ( we_i[1] ),
.a_x ( a_gamma1_q ), // Share a of x
.a_y ( a_omega_buf ), // Share a of y
.b_x ( b_gamma1_q ), // Share b of x
.b_y ( b_omega_buf ), // Share b of y
.a_x_q ( a_gamma1_qq ), // Share a of x, pipelined (for Pipeline=1 or PreDomIndep=1)
.a_y_q ( a_omega_buf_q ), // Share a of y, pipelined (for Pipeline=1)
.b_x_q ( b_gamma1_qq ), // Share b of x, pipelined (for Pipeline=1 or PreDomIndep=1)
.b_y_q ( b_omega_buf_q ), // Share b of y, pipelined (for Pipeline=1)
.z_0 ( prd_3_i[5:4] ), // Randomness for blinding
.z_1 ( prd_3_i[7:6] ), // Randomness for resharing
.a_q ( a_gamma_inv[1:0] ), // Share a of q
.b_q ( b_gamma_inv[1:0] ), // Share b of q
.prd_o ( b_gamma1_omega_prd3 ) // Randomness for use in another S-Box instance
);
// Masked multiplication omega * gamma0, producing the upper half of the result.
logic [3:0] b_gamma0_omega_prd3;
aes_dom_dep_mul_gf2pn #(
.NPower ( 2 ),
.Pipeline ( PipelineMul ),
.PreDomIndep ( 1'b0 )
) u_aes_dom_mul_omega_gamma0 (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.we_i ( we_i[1] ),
.a_x ( a_omega_buf ), // Share a of x
.a_y ( a_gamma0_q ), // Share a of y
.b_x ( b_omega_buf ), // Share b of x
.b_y ( b_gamma0_q ), // Share b of y
.a_x_q ( a_omega_buf_q ), // Share a of x, pipelined (for Pipeline=1 or PreDomIndep=1)
.a_y_q ( a_gamma0_qq ), // Share a of y, pipelined (for Pipeline=1)
.b_x_q ( b_omega_buf_q ), // Share b of x, pipelined (for Pipeline=1 or PreDomIndep=1)
.b_y_q ( b_gamma0_qq ), // Share b of y, pipelined (for Pipeline=1)
.z_0 ( prd_3_i[1:0] ), // Randomness for blinding
.z_1 ( prd_3_i[3:2] ), // Randomness for resharing
.a_q ( a_gamma_inv[3:2] ), // Share a of q
.b_q ( b_gamma_inv[3:2] ), // Share b of q
.prd_o ( b_gamma0_omega_prd3 ) // Randomness for use in another S-Box instance
);
// Use intermediate results for generating PRD for Stage 4 of another S-Box instance.
// Use one share only. Directly use output of flops updating with we_i[1].
// b_gamma1/0_omega_prd3 are both based on b_omega but XORed with different parts of prd_3_i,
// thus uniformly distributed and independent of b_omega (see Lemma 1 in [2]).
assign prd_3_o = {b_gamma1_omega_prd3, b_gamma0_omega_prd3};
endmodule
// Inverse in GF(2^8) using first-order domain-oriented masking and normal basis [y^16, y].
// See Fig. 6 in [1] and Formulas 3, 12, 18 and 19 in [2].
//
// Parameters:
// - PipelineMul: Forwarded to all multipliers and to the GF(2^4) inverter. If set, the input
//                pipeline registers for the Stage 1 DOM-dep multiplier are instantiated in this
//                module.
//
// Ports:
// - we_i[0]:   Enable for the Stage 1 registers.
// - we_i[2:1]: Forwarded to the GF(2^4) inverter (Stages 2 and 3). we_i[2] additionally enables
//              the registers capturing the input shares for Stage 4.
// - we_i[3]:   Enable for the registers inside the Stage 4 DOM-indep multipliers.
module aes_dom_inverse_gf2p8 #(
parameter bit PipelineMul = 1'b1
) (
input logic clk_i,
input logic rst_ni,
input logic [3:0] we_i,
input logic [7:0] a_y, // input data masked by b_y
input logic [7:0] b_y, // input mask
input prd_in_t prd_i, // pseudo-random data, e.g. for intermediate masks
output logic [7:0] a_y_inv, // output data masked by b_y_inv
output logic [7:0] b_y_inv, // output mask
output prd_out_t prd_o // pseudo-random data, e.g. for use in another S-Box instance
);
// Provides the GF(2^4) helper function used below.
import aes_sbox_canright_pkg::*;
/////////////
// Stage 1 //
/////////////
// Formula 12 in [2].
// Split both shares of y into their upper (y1) and lower (y0) halves.
logic [3:0] a_y1, a_y0, b_y1, b_y0, a_y1_y0, b_y1_y0;
assign a_y1 = a_y[7:4];
assign a_y0 = a_y[3:0];
assign b_y1 = b_y[7:4];
assign b_y0 = b_y[3:0];
// Square-scale term, computed independently per share and then registered.
logic [3:0] a_y_ss_d, b_y_ss_d;
logic [3:0] a_y_ss_q, b_y_ss_q;
assign a_y_ss_d = aes_square_scale_gf2p4_gf2p2(a_y1 ^ a_y0);
assign b_y_ss_d = aes_square_scale_gf2p4_gf2p2(b_y1 ^ b_y0);
prim_flop_en #(
.Width ( 8 ),
.ResetValue ( '0 )
) u_prim_flop_ab_y_ss (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.en_i ( we_i[0] ),
.d_i ( {a_y_ss_d, b_y_ss_d} ),
.q_o ( {a_y_ss_q, b_y_ss_q} )
);
// Optional input pipeline registers for the Stage 1 DOM-dep multiplier.
logic [3:0] a_y1_q, a_y0_q, b_y1_q, b_y0_q;
if (PipelineMul == 1'b1) begin: gen_prim_flop_ab_y10
// We instantiate the input pipeline registers for the DOM-dep multiplier outside of the
// multiplier to enable sharing of pipeline registers where applicable.
prim_flop_en #(
.Width ( 16 ),
.ResetValue ( '0 )
) u_prim_flop_ab_y10 (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.en_i ( we_i[0] ),
.d_i ( {a_y1, a_y0, b_y1, b_y0} ),
.q_o ( {a_y1_q, a_y0_q, b_y1_q, b_y0_q} )
);
end else begin : gen_no_prim_flop_ab_y10
// When using un-pipelined multipliers, there is no need to insert additional registers.
// We drive the corresponding inputs to 0 to make sure the functionality isn't correct in case
// the pipelined inputs are erroneously used.
assign a_y1_q = '0;
assign a_y0_q = '0;
assign b_y1_q = '0;
assign b_y0_q = '0;
end
// Masked multiplication y1 * y0. The two operands are halves of the same sharing, thus a
// DOM-dep multiplier is used.
logic [7:0] b_y10_prd1;
aes_dom_dep_mul_gf2pn #(
.NPower ( 4 ),
.Pipeline ( PipelineMul ),
.PreDomIndep ( 1'b0 )
) u_aes_dom_mul_y1_y0 (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.we_i ( we_i[0] ),
.a_x ( a_y1 ), // Share a of x
.a_y ( a_y0 ), // Share a of y
.b_x ( b_y1 ), // Share b of x
.b_y ( b_y0 ), // Share b of y
.a_x_q ( a_y1_q ), // Share a of x, pipelined (for Pipeline=1 or PreDomIndep=1)
.a_y_q ( a_y0_q ), // Share a of y, pipelined (for Pipeline=1)
.b_x_q ( b_y1_q ), // Share b of x, pipelined (for Pipeline=1 or PreDomIndep=1)
.b_y_q ( b_y0_q ), // Share b of y, pipelined (for Pipeline=1)
.z_0 ( prd_i.prd_1[3:0] ), // Randomness for blinding
.z_1 ( prd_i.prd_1[7:4] ), // Randomness for resharing
.a_q ( a_y1_y0 ), // Share a of q
.b_q ( b_y1_y0 ), // Share b of q
.prd_o ( b_y10_prd1 ) // Randomness for use in another S-Box instance
);
// Per share, add the registered square-scale term to the product to obtain gamma.
logic [3:0] a_gamma, b_gamma;
assign a_gamma = a_y_ss_q ^ a_y1_y0;
assign b_gamma = b_y_ss_q ^ b_y1_y0;
// Avoid aggressive synthesis optimizations.
logic [3:0] a_gamma_buf, b_gamma_buf;
prim_buf #(
.Width ( 8 )
) u_prim_buf_ab_gamma (
.in_i ( {a_gamma, b_gamma} ),
.out_o ( {a_gamma_buf, b_gamma_buf} )
);
// Use intermediate results for generating PRD for Stage 2 of another S-Box instance.
// Use one share only. Directly use output of flops updating with we_i[0].
// b_y10_prd1 is based on b_y and XORed with prd_1. We just use the lower part involving a
// non-linear element. The upper part is not used and explicitly tied off.
assign prd_o.prd_1 = b_y10_prd1[3:0];
logic [3:0] unused_prd;
assign unused_prd = b_y10_prd1[7:4];
////////////////////
// Stages 2 and 3 //
////////////////////
// theta is the masked inverse of gamma in GF(2^4).
logic [3:0] a_theta, b_theta;
// a_gamma is masked by b_gamma, a_gamma_inv is masked by b_gamma_inv.
aes_dom_inverse_gf2p4 #(
.PipelineMul ( PipelineMul )
) u_aes_dom_inverse_gf2p4 (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.we_i ( we_i[2:1] ),
.a_gamma ( a_gamma_buf ),
.b_gamma ( b_gamma_buf ),
.prd_2_i ( prd_i.prd_2 ),
.prd_3_i ( prd_i.prd_3 ),
.a_gamma_inv ( a_theta ),
.b_gamma_inv ( b_theta ),
.prd_2_o ( prd_o.prd_2 ),
.prd_3_o ( prd_o.prd_3 )
);
/////////////
// Stage 4 //
/////////////
// Formulas 18 and 19 in [2].
// Capture the halves of the input shares for the final multiplications. Note that the
// unregistered inputs are sampled here when we_i[2] is asserted.
// NOTE(review): This presumably relies on the parent holding a_y/b_y stable during the whole
// evaluation - confirm against the instantiating module.
logic [3:0] a_y1_qqq, a_y0_qqq, b_y1_qqq, b_y0_qqq;
prim_flop_en #(
.Width ( 16 ),
.ResetValue ( '0 )
) u_prim_flop_ab_y10_qqq (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.en_i ( we_i[2] ),
.d_i ( {a_y1, a_y0, b_y1, b_y0} ),
.q_o ( {a_y1_qqq, a_y0_qqq, b_y1_qqq, b_y0_qqq} )
);
// Masked multiplication y1 * theta, producing the lower half of the result.
aes_dom_indep_mul_gf2pn #(
.NPower ( 4 ),
.Pipeline ( PipelineMul )
) u_aes_dom_mul_theta_y1 (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.we_i ( we_i[3] ),
.a_x ( a_y1_qqq ), // Share a of x
.a_y ( a_theta ), // Share a of y
.b_x ( b_y1_qqq ), // Share b of x
.b_y ( b_theta ), // Share b of y
.z_0 ( prd_i.prd_4[7:4] ), // Randomness for resharing
.a_q ( a_y_inv[3:0] ), // Share a of q
.b_q ( b_y_inv[3:0] ) // Share b of q
);
// Masked multiplication theta * y0, producing the upper half of the result.
aes_dom_indep_mul_gf2pn #(
.NPower ( 4 ),
.Pipeline ( PipelineMul )
) u_aes_dom_mul_theta_y0 (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.we_i ( we_i[3] ),
.a_x ( a_theta ), // Share a of x
.a_y ( a_y0_qqq ), // Share a of y
.b_x ( b_theta ), // Share b of x
.b_y ( b_y0_qqq ), // Share b of y
.z_0 ( prd_i.prd_4[3:0] ), // Randomness for resharing
.a_q ( a_y_inv[7:4] ), // Share a of q
.b_q ( b_y_inv[7:4] ) // Share b of q
);
endmodule
// SEC_CM: KEY.MASKING
//
// Top level of the first-order DOM-masked AES S-Box.
//
// The module converts both shares (data and mask) into normal basis X, runs the masked GF(2^8)
// inversion and converts the result back. A 3-bit cycle counter sequences the register write
// enables of the inversion stages and drives the out_req_o/out_ack_i handshake:
// - While en_i is high, the counter advances once per clock cycle from 0 to 4.
// - Counter values 0 to 3 enable the registers of Stages 1 to 4, respectively.
// - At counter value 4, out_req_o is raised. The counter holds until out_ack_i is seen and then
//   restarts from 0.
//
// For any op_i other than CIPH_INV, the forward S-Box is evaluated.
module aes_sbox_dom
#(
  parameter bit PipelineMul = 1'b1
) (
  input  logic              clk_i,
  input  logic              rst_ni,
  input  logic              en_i,
  output logic              out_req_o,
  input  logic              out_ack_i,
  input  aes_pkg::ciph_op_e op_i,
  input  logic [7:0]        data_i, // masked, the actual input data is data_i ^ mask_i
  input  logic [7:0]        mask_i, // input mask
  input  logic [27:0]       prd_i,  // pseudo-random data for remasking, in total we need 28 bits
                                    // of PRD per evaluation, but at most 8 bits per cycle
  output logic [7:0]        data_o, // masked, the actual output data is data_o ^ mask_o
  output logic [7:0]        mask_o, // output mask
  output logic [19:0]       prd_o   // PRD for usage in Stages 2 - 4 of other S-Box instances
);

  import aes_pkg::*;
  import aes_sbox_canright_pkg::*;

  /////////////
  // Signals //
  /////////////

  // Shares in normal basis X before and after the inversion.
  logic [7:0] x_data_in, x_data_out;
  logic [7:0] x_mask_in, x_mask_out;

  // Sequencing.
  logic [2:0] cycle_d, cycle_q;
  logic [3:0] stage_we;
  logic       result_taken, advance;

  // Pseudo-random data.
  logic [7:0] prd_stage1_d, prd_stage1_q;
  prd_in_t    inv_prd_in;
  prd_out_t   inv_prd_out;

  // Operation decoding.
  logic       op_is_fwd, op_is_inv;
  assign op_is_fwd = (op_i == CIPH_FWD);
  assign op_is_inv = (op_i == CIPH_INV);

  ////////////////////////////
  // Input basis conversion //
  ////////////////////////////

  // Candidate conversions of the data share. For the inverse S-Box, the constant 8'h63 is added
  // to the data share prior to the affine transformation.
  logic [7:0] data_a2x, data_s2x;
  assign data_a2x = aes_mvm(data_i, A2X);
  assign data_s2x = aes_mvm(data_i ^ 8'h63, S2X);

  // Candidate conversions of the mask share. The addition of the constant 8'h63 is skipped for
  // the mask as it must be applied to one share only.
  logic [7:0] mask_a2x, mask_s2x;
  assign mask_a2x = aes_mvm(mask_i, A2X);
  assign mask_s2x = aes_mvm(mask_i, S2X);

  assign x_data_in = op_is_fwd ? data_a2x :
                     op_is_inv ? data_s2x : data_a2x;
  assign x_mask_in = op_is_fwd ? mask_a2x :
                     op_is_inv ? mask_s2x : mask_a2x;

  ///////////////
  // Inversion //
  ///////////////

  // Do the inversion in normal basis X.
  aes_dom_inverse_gf2p8 #(
    .PipelineMul ( PipelineMul )
  ) u_aes_dom_inverse_gf2p8 (
    .clk_i   ( clk_i       ),
    .rst_ni  ( rst_ni      ),
    .we_i    ( stage_we    ),
    .a_y     ( x_data_in   ), // input
    .b_y     ( x_mask_in   ), // input
    .prd_i   ( inv_prd_in  ), // input
    .a_y_inv ( x_data_out  ), // output
    .b_y_inv ( x_mask_out  ), // output
    .prd_o   ( inv_prd_out )  // output
  );

  /////////////////////////////
  // Output basis conversion //
  /////////////////////////////

  // Candidate conversions of the data share. For the forward S-Box, the constant 8'h63 is added
  // to the data share following the affine transformation.
  logic [7:0] data_x2s, data_x2a;
  assign data_x2s = aes_mvm(x_data_out, X2S) ^ 8'h63;
  assign data_x2a = aes_mvm(x_data_out, X2A);

  // Candidate conversions of the mask share, again without the constant 8'h63.
  logic [7:0] mask_x2s, mask_x2a;
  assign mask_x2s = aes_mvm(x_mask_out, X2S);
  assign mask_x2a = aes_mvm(x_mask_out, X2A);

  assign data_o = op_is_fwd ? data_x2s :
                  op_is_inv ? data_x2a : data_x2s;
  assign mask_o = op_is_fwd ? mask_x2s :
                  op_is_inv ? mask_x2a : mask_x2s;

  ////////////////
  // Sequencing //
  ////////////////

  // The result is handed over once the request is acknowledged. The counter only advances while
  // enabled and no result is pending.
  assign result_taken = out_req_o && out_ack_i;
  assign advance      = en_i && !out_req_o;

  assign cycle_d = result_taken ? '0              :
                   advance      ? cycle_q + 3'd1 : cycle_q;

  always_ff @(posedge clk_i or negedge rst_ni) begin : reg_count
    if (!rst_ni) begin
      cycle_q <= '0;
    end else begin
      cycle_q <= cycle_d;
    end
  end

  // The result is valid after the four register stages have been updated.
  assign out_req_o = en_i & (cycle_q == 3'd4);

  // Write enable signals for internal registers: Bit i is set in cycle i.
  assign stage_we = {4{en_i}} & {cycle_q == 3'd3,
                                 cycle_q == 3'd2,
                                 cycle_q == 3'd1,
                                 cycle_q == 3'd0};

  /////////
  // PRD //
  /////////

  // Buffer and forward PRD for the individual stages. We get 8 bits from the PRNG for usage in
  // the first cycle. Stages 2, 3 and 4 are driven by other S-Box instances.
  // The Stage 1 PRD is taken from the input in the first cycle and held afterwards. Note that the
  // inversion is fed with the register input, not with the register output.
  assign prd_stage1_d = stage_we[0] ? prd_i[7:0] : prd_stage1_q;

  prim_flop #(
    .Width      ( 8  ),
    .ResetValue ( '0 )
  ) u_prim_flop_prd1_q (
    .clk_i  ( clk_i        ),
    .rst_ni ( rst_ni       ),
    .d_i    ( prd_stage1_d ),
    .q_o    ( prd_stage1_q )
  );

  assign inv_prd_in.prd_1 = prd_stage1_d;
  assign inv_prd_in.prd_2 = prd_i[11:8];
  assign inv_prd_in.prd_3 = prd_i[19:12];
  assign inv_prd_in.prd_4 = prd_i[27:20];

  // Stage 3 PRD goes to the MSBs, Stage 1 PRD to the LSBs of the output.
  assign prd_o = {inv_prd_out.prd_3, inv_prd_out.prd_2, inv_prd_out.prd_1};

endmodule