// hw/ip/prim/rtl/prim_dom_and_2share.sv - 3p/lowrisc/opentitan - Git at Google

 // Copyright lowRISC contributors.
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 //
 // Domain-Oriented Masking GF(2) Multiplier with 2-shares
 // ref: Higher-Order Side-Channel Protected Implementations of Keccak
 //     https://eprint.iacr.org/2017/395.pdf
 //
 // q0 = a0 & b0 + (a0 & b1 + z)
 // q1 = a1 & b1 + (a1 & b0 + z)
 // () ==> registered
 //
 // All inputs should be stable for two clock cycles,
 // as the output becomes valid one clock cycle later.
 // For z, another slice of the state can be used,
 // as it is fairly random w.r.t. the current inputs.

 // General formula of Q in the paper
 // Qi = t{i,i} + Sig(j>i,d)(t{i,j}+Z{i+j*(j-1)/2}) + Sig(j<i,d)(t{i,j}+Z{j+i*(i-1)/2})
 // for d=1 (NumShare 2 for first order protection)
 // Q0 = t{0,0} + Sig(j>0,1)(t{0,j}+Z{j(j-1)/2}) + Sig(j<0,d)(..)
 //    = a0&b0  + (a0&b1 + z0                    + 0)
 // Q1 = t{1,1} + sig(j>1,1)(...) + sig(j<1,1)(t{1,j} + Z{j})
 //    = a1&b1  + (0              + a1&b0 + z0)

 `include "prim_assert.sv"

 // prim_dom_and_2share: 2-share Domain-Oriented Masking (DOM) AND gate.
 //
 // Data flow as wired below (prim_xor2 is defined outside this file and is assumed to be a
 // plain bitwise XOR of in0_i and in1_i):
 //   t0_d = (a0_i & b1_i) ^ z_i   -> registered into t0_q when z_valid_i is set
 //   t1_d = (a1_i & b0_i) ^ z_i   -> registered into t1_q when z_valid_i is set
 //   q0_o = t_a0b0 ^ t0_q
 //   q1_o = t_a1b1 ^ t1_q
 // where t_a0b0 / t_a1b1 are the inner-domain products (a0_i & b0_i) / (a1_i & b1_i), either
 // registered (Pipeline = 1) or purely combinational (Pipeline = 0).
 //
 // Parameters:
 //   DW       - bit width of every share and of the random input z_i.
 //   Pipeline - 1: register the inner-domain terms with the same enable as the cross-domain
 //                 terms; 0: feed the inner-domain terms combinationally into the output XOR.
 module prim_dom_and_2share #(
   parameter int DW = 64, // Input width
   parameter bit Pipeline = 1'b0 // Enable full pipelining
 ) (
   // Clock and reset, forwarded to the prim_flop_en instances. The assertions below pass
   // !rst_ni as their reset argument.
   input clk_i,
   input rst_ni,

   input [DW-1:0] a0_i, // share0 of a
   input [DW-1:0] a1_i, // share1 of a
   input [DW-1:0] b0_i, // share0 of b
   input [DW-1:0] b1_i, // share1 of b
   input          z_valid_i, // random number input validity; used as enable for all flops
   input [DW-1:0] z_i,  // random number; the same value remasks both cross-domain terms

   output logic [DW-1:0] q0_o, // share0 of q
   output logic [DW-1:0] q1_o, // share1 of q
   output logic [DW-1:0] prd_o // pseudo-random data for other instances
 );

   // t0_*/t1_*: remasked cross-domain terms before (_d) and after (_q) the register stage.
   logic [DW-1:0] t0_d, t0_q, t1_d, t1_q;
   // Inner-domain terms as seen by the integration XOR (registered or not, see Pipeline).
   logic [DW-1:0] t_a0b0, t_a1b1;
   // Inner-domain terms computed combinationally from the current inputs.
   logic [DW-1:0] t_a0b0_d, t_a1b1_d;
   // Cross-domain terms computed combinationally from the current inputs.
   logic [DW-1:0] t_a0b1, t_a1b0;

   /////////////////
   // Calculation //
   /////////////////
   // Inner-domain terms: both operands come from the same share index.
   assign t_a0b0_d = a0_i & b0_i;
   assign t_a1b1_d = a1_i & b1_i;

   // Cross-domain terms: operands come from different share indices.
   assign t_a0b1 = a0_i & b1_i;
   assign t_a1b0 = a1_i & b0_i;

   ///////////////
   // Resharing //
   ///////////////
   // Resharing of cross-domain terms

   // Preserve the logic sequence: the XOR with z_i must not precede the cross-domain AND.
   // Both cross-domain terms are handled by one 2*DW wide instance and use the same z_i.
   prim_xor2 #(
     .Width ( DW*2 )
   ) u_prim_xor_t01 (
     .in0_i ( {t_a0b1, t_a1b0} ),
     .in1_i ( {z_i,    z_i}    ),
     .out_o ( {t0_d,   t1_d}   )
   );

   // Register stage: the remasked cross-domain terms are captured only when z_valid_i is set
   // and reset to zero.
   prim_flop_en #(
     .Width      ( DW*2 ),
     .ResetValue ( '0   )
   ) u_prim_flop_t01 (
     .clk_i  ( clk_i        ),
     .rst_ni ( rst_ni       ),
     .en_i   ( z_valid_i    ),
     .d_i    ( {t0_d, t1_d} ),
     .q_o    ( {t0_q, t1_q} )
   );

   /////////////////////////
   // Optional Pipelining //
   /////////////////////////

   if (Pipeline == 1'b1) begin : gen_inner_domain_regs
     // Add pipeline registers on inner-domain terms prior to integration. This allows accepting new
     // input data every clock cycle and prevents SCA leakage occurring due to the integration of
     // reshared cross-domain terms with inner-domain terms derived from different input data.

     // The registers use the same enable (z_valid_i) as the cross-domain flops, so both sets of
     // terms update in the same cycle.
     logic [DW-1:0] t_a0b0_q, t_a1b1_q;
     prim_flop_en #(
       .Width      ( DW*2 ),
       .ResetValue ( '0   )
     ) u_prim_flop_tab01 (
       .clk_i  ( clk_i                ),
       .rst_ni ( rst_ni               ),
       .en_i   ( z_valid_i            ),
       .d_i    ( {t_a0b0_d, t_a1b1_d} ),
       .q_o    ( {t_a0b0_q, t_a1b1_q} )
     );

     assign t_a0b0 = t_a0b0_q;
     assign t_a1b1 = t_a1b1_q;

   end else begin : gen_no_inner_domain_regs
     // Do not add the optional pipeline registers on the inner-domain terms. This saves some
     // area in case the multiplier does not need to accept new data in every cycle. However,
     // this can cause SCA leakage as during the clock cycle in which new data arrives, the new
     // inner-domain terms are integrated with the previous, reshared cross-domain terms.

     assign t_a0b0 = t_a0b0_d;
     assign t_a1b1 = t_a1b1_d;
   end

   /////////////////
   // Integration //
   /////////////////

   // Preserve the logic sequence: the XOR must not precede the inner-domain AND.
   // Each output share combines its inner-domain term with its registered, remasked
   // cross-domain term.
   prim_xor2 #(
     .Width ( DW*2 )
   ) u_prim_xor_q01 (
     .in0_i ( {t_a0b0, t_a1b1} ),
     .in1_i ( {t0_q,   t1_q}   ),
     .out_o ( {q0_o,   q1_o}   )
   );

   // Use intermediate results for remasking computations in another instance in the following
   // clock cycle. Use one share only. Directly use output of flops updating with z_valid_i.
   // t1_q is obtained by remasking t_a1b0 with z_i. Since z_i is uniformly distributed and
   // independent of a1/b0_i, t1_q is also uniformly distributed and independent of a1/b0_i.
   // For details, see Lemma 1 in Canright, "A very compact 'perfectly masked' S-box for AES
   // (corrected)" available at https://eprint.iacr.org/2009/011.pdf
   assign prd_o = t1_q;

   // DOM AND should be same as unmasked computation
   // The correct test sequence will be:
   //   1. inputs are changed
   //   2. check if z_valid_i,
   //   3. at the next cycle, inputs are still stable (assumption) - only in case Pipeline = 0
   //   4. and results Q == A & B (assertion)

   // To speed up the FPV process, assume that the random value becomes valid within
   // zero to two cycles after any input share changes.
   `ASSUME_FPV(RandomReadyInShortTime_A,
     $changed(a0_i) || $changed(a1_i) || $changed(b0_i) || $changed(b1_i)
       |-> ##[0:2] z_valid_i,
     clk_i, !rst_ni)

   if (Pipeline == 0) begin: g_assert_stable
     // If Pipeline is not set, the inner-domain terms are not registered, so the output mixes
     // current inner-domain terms with registered cross-domain terms. The inputs therefore
     // have to remain stable in the cycle after z_valid_i, i.e., for at least two cycles.
     `ASSUME(StableTwoCycles_M,
       ($changed(a0_i)  || $changed(a1_i) || $changed(b0_i) || $changed(b1_i))
         ##[0:$] z_valid_i |=>
         $stable(a0_i) && $stable(a1_i) && $stable(b0_i) && $stable(b1_i))
   end

   // One cycle after z_valid_i, the XOR of the two output shares must equal the AND of the
   // unmasked operands (a0 ^ a1) and (b0 ^ b1) sampled in the z_valid_i cycle.
   `ASSERT(UnmaskedAndMatched_A,
     z_valid_i |=> (q0_o ^ q1_o) ==
       (($past(a0_i) ^ $past(a1_i)) & ($past(b0_i) ^ $past(b1_i))),
     clk_i, !rst_ni)

 endmodule
	// Copyright lowRISC contributors.
	// Licensed under the Apache License, Version 2.0, see LICENSE for details.
	// SPDX-License-Identifier: Apache-2.0
	//
	// Domain-Oriented Masking GF(2) Multiplier with 2-shares
	// ref: Higher-Order Side-Channel Protected Implementations of Keccak
	// https://eprint.iacr.org/2017/395.pdf
	//
	// q0 = a0 & b0 + (a0 & b1 + z)
	// q1 = a1 & b1 + (a1 & b0 + z)
	// () ==> registered
	//
	// All inputs should be stable for two clock cycles,
	// as the output becomes valid one clock cycle later.
	// For z, another slice of the state can be used,
	// as it is fairly random w.r.t. the current inputs.

	// General formula of Q in the paper
	// Qi = t{i,i} + Sig(j>i,d)(t{i,j}+Z{i+j*(j-1)/2}) + Sig(j<i,d)(t{i,j}+Z{j+i*(i-1)/2})
	// for d=1 (NumShare 2 for first order protection)
	// Q0 = t{0,0} + Sig(j>0,1)(t{0,j}+Z{j(j-1)/2}) + Sig(j<0,d)(..)
	//    = a0&b0  + (a0&b1 + z0                    + 0)
	// Q1 = t{1,1} + sig(j>1,1)(...) + sig(j<1,1)(t{1,j} + Z{j})
	//    = a1&b1  + (0              + a1&b0 + z0)

	`include "prim_assert.sv"

	// prim_dom_and_2share: 2-share Domain-Oriented Masking (DOM) AND gate.
	//
	// Data flow as wired below (prim_xor2 is defined outside this file and is assumed to be a
	// plain bitwise XOR of in0_i and in1_i):
	//   t0_d = (a0_i & b1_i) ^ z_i   -> registered into t0_q when z_valid_i is set
	//   t1_d = (a1_i & b0_i) ^ z_i   -> registered into t1_q when z_valid_i is set
	//   q0_o = t_a0b0 ^ t0_q
	//   q1_o = t_a1b1 ^ t1_q
	// where t_a0b0 / t_a1b1 are the inner-domain products (a0_i & b0_i) / (a1_i & b1_i), either
	// registered (Pipeline = 1) or purely combinational (Pipeline = 0).
	//
	// Parameters:
	//   DW       - bit width of every share and of the random input z_i.
	//   Pipeline - 1: register the inner-domain terms with the same enable as the cross-domain
	//                 terms; 0: feed the inner-domain terms combinationally into the output XOR.
	module prim_dom_and_2share #(
	  parameter int DW = 64, // Input width
	  parameter bit Pipeline = 1'b0 // Enable full pipelining
	) (
	  // Clock and reset, forwarded to the prim_flop_en instances. The assertions below pass
	  // !rst_ni as their reset argument.
	  input clk_i,
	  input rst_ni,

	  input [DW-1:0] a0_i, // share0 of a
	  input [DW-1:0] a1_i, // share1 of a
	  input [DW-1:0] b0_i, // share0 of b
	  input [DW-1:0] b1_i, // share1 of b
	  input          z_valid_i, // random number input validity; used as enable for all flops
	  input [DW-1:0] z_i,  // random number; the same value remasks both cross-domain terms

	  output logic [DW-1:0] q0_o, // share0 of q
	  output logic [DW-1:0] q1_o, // share1 of q
	  output logic [DW-1:0] prd_o // pseudo-random data for other instances
	);

	  // t0_*/t1_*: remasked cross-domain terms before (_d) and after (_q) the register stage.
	  logic [DW-1:0] t0_d, t0_q, t1_d, t1_q;
	  // Inner-domain terms as seen by the integration XOR (registered or not, see Pipeline).
	  logic [DW-1:0] t_a0b0, t_a1b1;
	  // Inner-domain terms computed combinationally from the current inputs.
	  logic [DW-1:0] t_a0b0_d, t_a1b1_d;
	  // Cross-domain terms computed combinationally from the current inputs.
	  logic [DW-1:0] t_a0b1, t_a1b0;

	  /////////////////
	  // Calculation //
	  /////////////////
	  // Inner-domain terms: both operands come from the same share index.
	  assign t_a0b0_d = a0_i & b0_i;
	  assign t_a1b1_d = a1_i & b1_i;

	  // Cross-domain terms: operands come from different share indices.
	  assign t_a0b1 = a0_i & b1_i;
	  assign t_a1b0 = a1_i & b0_i;

	  ///////////////
	  // Resharing //
	  ///////////////
	  // Resharing of cross-domain terms

	  // Preserve the logic sequence: the XOR with z_i must not precede the cross-domain AND.
	  // Both cross-domain terms are handled by one 2*DW wide instance and use the same z_i.
	  prim_xor2 #(
	    .Width ( DW*2 )
	  ) u_prim_xor_t01 (
	    .in0_i ( {t_a0b1, t_a1b0} ),
	    .in1_i ( {z_i,    z_i}    ),
	    .out_o ( {t0_d,   t1_d}   )
	  );

	  // Register stage: the remasked cross-domain terms are captured only when z_valid_i is set
	  // and reset to zero.
	  prim_flop_en #(
	    .Width      ( DW*2 ),
	    .ResetValue ( '0   )
	  ) u_prim_flop_t01 (
	    .clk_i  ( clk_i        ),
	    .rst_ni ( rst_ni       ),
	    .en_i   ( z_valid_i    ),
	    .d_i    ( {t0_d, t1_d} ),
	    .q_o    ( {t0_q, t1_q} )
	  );

	  /////////////////////////
	  // Optional Pipelining //
	  /////////////////////////

	  if (Pipeline == 1'b1) begin : gen_inner_domain_regs
	    // Add pipeline registers on inner-domain terms prior to integration. This allows accepting
	    // new input data every clock cycle and prevents SCA leakage occurring due to the
	    // integration of reshared cross-domain terms with inner-domain terms derived from
	    // different input data.

	    // The registers use the same enable (z_valid_i) as the cross-domain flops, so both sets
	    // of terms update in the same cycle.
	    logic [DW-1:0] t_a0b0_q, t_a1b1_q;
	    prim_flop_en #(
	      .Width      ( DW*2 ),
	      .ResetValue ( '0   )
	    ) u_prim_flop_tab01 (
	      .clk_i  ( clk_i                ),
	      .rst_ni ( rst_ni               ),
	      .en_i   ( z_valid_i            ),
	      .d_i    ( {t_a0b0_d, t_a1b1_d} ),
	      .q_o    ( {t_a0b0_q, t_a1b1_q} )
	    );

	    assign t_a0b0 = t_a0b0_q;
	    assign t_a1b1 = t_a1b1_q;

	  end else begin : gen_no_inner_domain_regs
	    // Do not add the optional pipeline registers on the inner-domain terms. This saves some
	    // area in case the multiplier does not need to accept new data in every cycle. However,
	    // this can cause SCA leakage as during the clock cycle in which new data arrives, the new
	    // inner-domain terms are integrated with the previous, reshared cross-domain terms.

	    assign t_a0b0 = t_a0b0_d;
	    assign t_a1b1 = t_a1b1_d;
	  end

	  /////////////////
	  // Integration //
	  /////////////////

	  // Preserve the logic sequence: the XOR must not precede the inner-domain AND.
	  // Each output share combines its inner-domain term with its registered, remasked
	  // cross-domain term.
	  prim_xor2 #(
	    .Width ( DW*2 )
	  ) u_prim_xor_q01 (
	    .in0_i ( {t_a0b0, t_a1b1} ),
	    .in1_i ( {t0_q,   t1_q}   ),
	    .out_o ( {q0_o,   q1_o}   )
	  );

	  // Use intermediate results for remasking computations in another instance in the following
	  // clock cycle. Use one share only. Directly use output of flops updating with z_valid_i.
	  // t1_q is obtained by remasking t_a1b0 with z_i. Since z_i is uniformly distributed and
	  // independent of a1/b0_i, t1_q is also uniformly distributed and independent of a1/b0_i.
	  // For details, see Lemma 1 in Canright, "A very compact 'perfectly masked' S-box for AES
	  // (corrected)" available at https://eprint.iacr.org/2009/011.pdf
	  assign prd_o = t1_q;

	  // DOM AND should be same as unmasked computation
	  // The correct test sequence will be:
	  //   1. inputs are changed
	  //   2. check if z_valid_i,
	  //   3. at the next cycle, inputs are still stable (assumption) - only in case Pipeline = 0
	  //   4. and results Q == A & B (assertion)

	  // To speed up the FPV process, assume that the random value becomes valid within
	  // zero to two cycles after any input share changes.
	  `ASSUME_FPV(RandomReadyInShortTime_A,
	    $changed(a0_i) || $changed(a1_i) || $changed(b0_i) || $changed(b1_i)
	      |-> ##[0:2] z_valid_i,
	    clk_i, !rst_ni)

	  if (Pipeline == 0) begin: g_assert_stable
	    // If Pipeline is not set, the inner-domain terms are not registered, so the output mixes
	    // current inner-domain terms with registered cross-domain terms. The inputs therefore
	    // have to remain stable in the cycle after z_valid_i, i.e., for at least two cycles.
	    `ASSUME(StableTwoCycles_M,
	      ($changed(a0_i) || $changed(a1_i) || $changed(b0_i) || $changed(b1_i))
	        ##[0:$] z_valid_i |=>
	        $stable(a0_i) && $stable(a1_i) && $stable(b0_i) && $stable(b1_i))
	  end

	  // One cycle after z_valid_i, the XOR of the two output shares must equal the AND of the
	  // unmasked operands (a0 ^ a1) and (b0 ^ b1) sampled in the z_valid_i cycle.
	  `ASSERT(UnmaskedAndMatched_A,
	    z_valid_i |=> (q0_o ^ q1_o) ==
	      (($past(a0_i) ^ $past(a1_i)) & ($past(b0_i) ^ $past(b1_i))),
	    clk_i, !rst_ni)

	endmodule