// blob: b45db9ae48afbdcecc2151c4f0d73db2c6c255ec [file] [log] [blame]
// Copyright lowRISC contributors.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
//
// Domain-Oriented Masking GF(2) Multiplier with 2-shares
// ref: Higher-Order Side-Channel Protected Implementations of Keccak
// https://eprint.iacr.org/2017/395.pdf
//
// q0 = a0 & b0 + (a0 & b1 + z)
// q1 = a1 & b1 + (a1 & b0 + z)
// () ==> registered
//
// If Pipeline = 0, all inputs should be stable for two clock cycles,
// as the output is valid one clock cycle after z_valid_i.
// For z, another slice of the state can be used,
// as it is fairly random w.r.t. the current inputs.
// General formula of Q in the paper
// Qi = t{i,i} + Sig(j>i,d)(t{i,j}+Z{i+j*(j-1)/2}) + Sig(j<i,d)(t{i,j}+Z{j+i*(i-1)/2})
// for d=1 (NumShare 2 for first order protection)
// Q0 = t{0,0} + Sig(j>0,1)(t{0,j}+Z{j(j-1)/2}) + Sig(j<0,d)(..)
// = a0&b0 + (a0&b1 + z0 + 0)
// Q1 = t{1,1} + sig(j>1,1)(...) + sig(j<1,1)(t{1,j} + Z{j})
// = a1&b1 + (0 + a1&b0 + z0)
`include "prim_assert.sv"
module prim_dom_and_2share #(
  parameter int DW       = 64,   // Width of each share in bits
  parameter bit Pipeline = 1'b0  // 1: also register the inner-domain terms (full pipelining)
) (
  input                 clk_i,
  input                 rst_ni,

  input        [DW-1:0] a0_i,       // share0 of a
  input        [DW-1:0] a1_i,       // share1 of a
  input        [DW-1:0] b0_i,       // share0 of b
  input        [DW-1:0] b1_i,       // share1 of b
  input                 z_valid_i,  // z_i is valid; enables the register stage(s)
  input        [DW-1:0] z_i,        // random number used for resharing
  output logic [DW-1:0] q0_o,       // share0 of q
  output logic [DW-1:0] q1_o,       // share1 of q
  output logic [DW-1:0] prd_o       // pseudo-random data for other instances
);

  // Both output domains are carried side by side in vectors of 2*DW bits:
  // the upper DW bits belong to domain 0 (q0_o), the lower DW bits to domain 1 (q1_o).
  localparam int PairW = DW*2;

  logic [PairW-1:0] inner_d;     // {a0 & b0, a1 & b1}
  logic [PairW-1:0] inner;       // inner-domain terms as seen by the integration stage
  logic [PairW-1:0] cross;       // {a0 & b1, a1 & b0}
  logic [PairW-1:0] reshared_d;  // cross-domain terms after remasking with z_i
  logic [PairW-1:0] reshared_q;  // registered copy of reshared_d

  ///////////////////////
  // Partial products  //
  ///////////////////////

  // Inner-domain terms: both operands come from the same share index.
  assign inner_d = {a0_i & b0_i, a1_i & b1_i};

  // Cross-domain terms: operands come from different share indices.
  assign cross = {a0_i & b1_i, a1_i & b0_i};

  ///////////////
  // Resharing //
  ///////////////

  // Remask both cross-domain terms with the same random value z_i. The XOR is a dedicated
  // prim_xor2 instance in order to preserve the logic sequence: the XOR must not precede the
  // cross-domain AND.
  prim_xor2 #(
    .Width ( PairW )
  ) u_prim_xor_t01 (
    .in0_i ( cross      ),
    .in1_i ( {2{z_i}}   ),
    .out_o ( reshared_d )
  );

  // Register stage for the reshared cross-domain terms, updating only when z_valid_i is set.
  prim_flop_en #(
    .Width      ( PairW ),
    .ResetValue ( '0    )
  ) u_prim_flop_t01 (
    .clk_i  ( clk_i      ),
    .rst_ni ( rst_ni     ),
    .en_i   ( z_valid_i  ),
    .d_i    ( reshared_d ),
    .q_o    ( reshared_q )
  );

  /////////////////////////
  // Optional Pipelining //
  /////////////////////////

  if (Pipeline) begin : gen_inner_domain_regs
    // Delay the inner-domain terms by the same register stage as the cross-domain terms before
    // they get integrated. With this, new input data can be accepted in every clock cycle, and
    // the SCA leakage that would result from integrating reshared cross-domain terms with
    // inner-domain terms derived from different input data is avoided.
    logic [PairW-1:0] inner_q;

    prim_flop_en #(
      .Width      ( PairW ),
      .ResetValue ( '0    )
    ) u_prim_flop_tab01 (
      .clk_i  ( clk_i     ),
      .rst_ni ( rst_ni    ),
      .en_i   ( z_valid_i ),
      .d_i    ( inner_d   ),
      .q_o    ( inner_q   )
    );

    assign inner = inner_q;
  end else begin : gen_no_inner_domain_regs
    // Feed the inner-domain terms to the integration stage combinationally. This saves the
    // area of the pipeline registers if the multiplier does not have to accept new data in
    // every cycle. The price is potential SCA leakage: in the clock cycle in which new data
    // arrives, the new inner-domain terms are integrated with the previous, reshared
    // cross-domain terms.
    assign inner = inner_d;
  end

  /////////////////
  // Integration //
  /////////////////

  // Combine the inner-domain terms with the registered, reshared cross-domain terms. Again a
  // dedicated prim_xor2 instance preserves the logic sequence: this XOR must not precede the
  // inner-domain AND.
  prim_xor2 #(
    .Width ( PairW )
  ) u_prim_xor_q01 (
    .in0_i ( inner        ),
    .in1_i ( reshared_q   ),
    .out_o ( {q0_o, q1_o} )
  );

  // Export an intermediate result so that another instance can use it for its remasking in the
  // following clock cycle. Only one share is exported, taken directly from the output of the
  // flops updating with z_valid_i: the lower half of reshared_q, i.e. the registered
  // (a1_i & b0_i) ^ z_i. Since z_i is uniformly distributed and independent of a1_i/b0_i, this
  // value is also uniformly distributed and independent of a1_i/b0_i.
  // For details, see Lemma 1 in Canright, "A very compact 'perfectly masked' S-box for AES
  // (corrected)" available at https://eprint.iacr.org/2009/011.pdf
  assign prd_o = reshared_q[DW-1:0];

  ////////////////
  // Assertions //
  ////////////////

  // The DOM AND must produce the same result as the unmasked computation.
  // The intended check sequence is:
  //   1. the inputs change,
  //   2. z_valid_i is observed,
  //   3. in the next cycle the inputs are still stable (assumption, only for Pipeline = 0),
  //   4. and the result satisfies Q == A & B (assertion).

  // To speed up the FPV process, the random value is ready in less than or equal to two cycles
  // after any input change.
  `ASSUME_FPV(RandomReadyInShortTime_A,
    $changed(a0_i) || $changed(a1_i) || $changed(b0_i) || $changed(b1_i)
      |-> ##[0:2] z_valid_i,
    clk_i, !rst_ni)

  if (!Pipeline) begin : g_assert_stable
    // Without the pipeline registers, the computation takes two cycles and the inner-domain
    // terms are not registered. The inputs therefore have to be stable for at least two cycles.
    `ASSUME(StableTwoCycles_M,
      ($changed(a0_i) || $changed(a1_i) || $changed(b0_i) || $changed(b1_i))
        ##[0:$] z_valid_i |=>
        $stable(a0_i) && $stable(a1_i) && $stable(b0_i) && $stable(b1_i))
  end

  // One cycle after z_valid_i, the recombined output shares equal the AND of the recombined
  // input shares sampled in the cycle in which z_valid_i was set.
  `ASSERT(UnmaskedAndMatched_A,
    z_valid_i |=> (q0_o ^ q1_o) ==
      ($past(a0_i ^ a1_i) & $past(b0_i ^ b1_i)),
    clk_i, !rst_ni)

endmodule