sw/otbn/code-snippets/barrett384_p384.s - 3p/lowrisc/opentitan - Git at Google

 /* Copyright lowRISC contributors. */
 /* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
 /* SPDX-License-Identifier: Apache-2.0 */
 /*
  *   Constant time 384 bit modular multiplication based on Barrett reduction
  *   algorithm.
  *
  *   Provides a modular multiplication kernel for 384 bit wide numbers
  *   for the modulus and barret constant of NIST curve P384 only.
  */

 .section .text

 /**
  * 384-bit modular multiplication based on Barrett reduction algorithm
  * optimized for the special modulus of the NIST P-384 curve.
  *
  * Returns c = a x b % p.
  *
  * Expects: two operands, modulus p and pre-calculated parameter u for barrett
  * reduction (usually greek mu in literature). u is expected without the
  * leading 1 at bit 384. u has to be pre-calculated as u = floor(2^768/p).
  *
  * This implementation mostly follows the description in the
  * "Handbook of Applied Cryptography" in Algorithm 14.42.
  * Differences:
  *   - This implementation incorporates a multiplication before the reduction.
  *     Therefore it expects two operands (a, b) instead of a wider integer x.
  *   - The computation of q2 ignores the MSbs of q1 and u to allow using
  *     a 384x384 bit multiplication. This is compensated later by
  *     individual (conditional) additions.
  *   - The truncations in step 2 of HAC 14.42 in the form of (... mod b^(k+1) )
  *     are not implemented here and the full register width is used. This
  *     allows to omit computation of r1 (since r1=x) and step 3 of HAC 14.42
  *
  * Flags: Flags when leaving this subroutine depend on a potentially discarded
  *        value and therefore are not usable after return.
  *
  * @param[in] [w11, w10]: a, first operand, max. length 384 bit, a < m.
  * @param[in] [w17, w16]: b, second operand, max. length 384 bit, b < m.
  * @param[in] [w13, w12]: p, modulus of P384 i.e.:
                              m = 2^384 - 2^128 - 2^96 + 2^32 - 1.
  * @param[in] [w15, w14]: u, pre-computed Barrett constant (without u[384]/MSb
  *                           of u which is always 1 for the allowed range but
  *                           has to be set to 0 here).
  * @param[in] w31: all-zero.
  * @param[out] [w17, w16]: c, result, max. length 384 bit.
  *
  * Clobbered registers: w16, w17, w18, w19, w20, w21, w22, w23, w24
  * Clobbered flag groups: FG0
  */
  .globl barrett384_p384
 barrett384_p384:
   /* Compute the integer product of the operands x = a * b
      x = [w18, w22, w21] = a * b = [w11, w10] * [w17, w16]
      => max. length x: 768 bit */
   bn.mulqacc.z          w10.0, w16.0,   0
   bn.mulqacc            w10.0, w16.1,  64
   bn.mulqacc.so w21.L,  w10.1, w16.0,  64
   bn.mulqacc            w10.0, w16.2,   0
   bn.mulqacc            w10.1, w16.1,   0
   bn.mulqacc            w10.2, w16.0,   0
   bn.mulqacc            w10.0, w16.3,  64
   bn.mulqacc            w10.1, w16.2,  64
   bn.mulqacc            w10.2, w16.1,  64
   bn.mulqacc.so w21.U,  w10.3, w16.0,  64
   bn.mulqacc            w10.0, w17.0,   0
   bn.mulqacc            w10.1, w16.3,   0
   bn.mulqacc            w10.2, w16.2,   0
   bn.mulqacc            w10.3, w16.1,   0
   bn.mulqacc            w11.0, w16.0,   0
   bn.mulqacc            w10.0, w17.1,  64
   bn.mulqacc            w10.1, w17.0,  64
   bn.mulqacc            w10.2, w16.3,  64
   bn.mulqacc            w10.3, w16.2,  64
   bn.mulqacc            w11.0, w16.1,  64
   bn.mulqacc.so w22.L,  w11.1, w16.0,  64
   bn.mulqacc            w10.1, w17.1,   0
   bn.mulqacc            w10.2, w17.0,   0
   bn.mulqacc            w10.3, w16.3,   0
   bn.mulqacc            w11.0, w16.2,   0
   bn.mulqacc            w11.1, w16.1,   0
   bn.mulqacc            w10.2, w17.1,  64
   bn.mulqacc            w10.3, w17.0,  64
   bn.mulqacc            w11.0, w16.3,  64
   bn.mulqacc.so w22.U,  w11.1, w16.2,  64
   bn.mulqacc            w10.3, w17.1,   0
   bn.mulqacc            w11.0, w17.0,   0
   bn.mulqacc            w11.1, w16.3,   0
   bn.mulqacc            w11.0, w17.1,  64
   bn.mulqacc.so w18.L,  w11.1, w17.0,  64
   bn.mulqacc.so w18.U,  w11.1, w17.1,   0

   /* Store correction factor to compensate for later neglected MSb of x.
      x is 768 bit wide and therefore the 383 bit right shifted version q1
      (below) contains 385 bit. Bit 384 of q1 is neglected to allow using a
      384x384 multiplier. For the MSb of x being set we temporary store u
      (or zero) here to be used in a later constant time correction of a
      multiplication with u. Note that this requires the MSb flag being carried
      over from the multiplication routine. */
   bn.sel w23, w14, w31, M
   bn.sel w24, w15, w31, M

   /* Compute q1 = x >> 383
      q1 = [w11, w10] = [w18, w22, w21] >> 383  = [w18, w21] >> 127
      => max length q1: 385 bits */
   bn.rshi w11, w31, w18 >> 127
   bn.rshi w10, w18, w22 >> 127

   /* Compute q2 = q1*u
      Instead of full q2 (which would be up to 770 bits) we ignore the MSb of u
      and the MSb of q1 and correct this later. This allows using a 384x384
      multiplier. We use the property that u for the modulus of P384 is zero in
      the bits 383 downto 129 and use a 384x192 multiplication routine.
      => max. length q2': 513 bit
      q2' = q1[383:0]*u[128:0] = [w18, w17, w16] = [w11, w10] * [w15, w14] */

   /* 576 = 384*192 bit multiplication kernel */
   bn.mulqacc.z          w10.0, w14.0,   0
   bn.mulqacc            w10.0, w14.1,  64
   bn.mulqacc.so w16.L,  w10.1, w14.0,  64
   bn.mulqacc            w10.0, w14.2,   0
   bn.mulqacc            w10.1, w14.1,   0
   bn.mulqacc            w10.2, w14.0,   0
   bn.mulqacc            w10.1, w14.2,  64
   bn.mulqacc            w10.2, w14.1,  64
   bn.mulqacc.so w16.U,  w10.3, w14.0,  64
   bn.mulqacc            w10.2, w14.2,   0
   bn.mulqacc            w10.3, w14.1,   0
   bn.mulqacc            w11.0, w14.0,   0
   bn.mulqacc            w10.3, w14.2,  64
   bn.mulqacc            w11.0, w14.1,  64
   bn.mulqacc.so w17.L,  w11.1, w14.0,  64
   bn.mulqacc            w11.0, w14.2,   0
   bn.mulqacc            w11.1, w14.1,   0
   bn.mulqacc.so w17.U,  w11.1, w14.2,  64

   /* w14.3 is always zero here due to structure of Barrett constant */
   bn.mulqacc.wo w18,    w11.1, w14.3,  64

   /* q3 = q2 >> 385
      In this step, the compensation for the neglected MSbs of q1 and u is
      carried out underway. To add them in the q2 domain, they would have to be
      left shifted by 384 bit first. To directly add them we first shift q2' by
      384 bit to the right, perform the additions, and shift the result another
      bit to the right. The additions cannot overflow due to leading zeros
      after shift.
      q2'' = q2' >> 384 = [w20, w19] = [w18, w17, w16] >> 384
                                     = [w18, w17] >> 128 */
   bn.rshi w20, w31, w18 >> 128
   bn.rshi w19, w18, w17 >> 128
   /* Add q1. This is unconditional since MSb of u is always 1.
      This cannot overflow due to leading zeros.
      q2''' = q2'' + q1 = [w20, w19] = [w20, w19] + [w10, w11] */
   bn.add w19, w19, w10
   bn.addc w20, w20, w11
   /* Conditionally add u (without leading 1) in case of MSb of x being set.
      This is the "real" q2 but shifted by 384 bits to the right. This cannot
      overflow due to leading zeros
      q2'''' = x[767]?q2'''+u[383:0]:q2'''
             = [w20, w19] + [w24, w23] = q2 >> 384 */
   bn.add w19, w19, w23
   bn.addc w20, w20, w24
   /* finally this gives q3 by shifting the remaining bit to the right
      q3 = q2 >> 385 = q2'''' >> 1 = [w11, w10] = [w20, w19] >> 1 */
   bn.rshi w11, w31, w20 >> 1
   bn.rshi w10, w20, w19 >> 1

   /* r2 = q3*m[511:0] = [w17, w16] = ([w11, w10] * [w13, w12])[511:0]
      A 384x384 bit multiplication kernel is used here, hence both q3 or p
      must not be wider than 384 bit. This is always the case for p. For q3 it
      is the case if a<p and b<p.
      The 256 highest bits of the multiplication result are not needed,
      so we do not compute them. */
   bn.mulqacc.z          w10.0, w12.0,   0
   bn.mulqacc            w10.0, w12.1,  64
   bn.mulqacc.so w16.L,  w10.1, w12.0,  64
   bn.mulqacc            w10.0, w12.2,   0
   bn.mulqacc            w10.1, w12.1,   0
   bn.mulqacc            w10.2, w12.0,   0
   bn.mulqacc            w10.0, w12.3,  64
   bn.mulqacc            w10.1, w12.2,  64
   bn.mulqacc            w10.2, w12.1,  64
   bn.mulqacc.so w16.U,  w10.3, w12.0,  64
   bn.mulqacc            w10.0, w13.0,   0
   bn.mulqacc            w10.1, w12.3,   0
   bn.mulqacc            w10.2, w12.2,   0
   bn.mulqacc            w10.3, w12.1,   0
   bn.mulqacc            w11.0, w12.0,   0
   bn.mulqacc            w10.0, w13.1,  64
   bn.mulqacc            w10.1, w13.0,  64
   bn.mulqacc            w10.2, w12.3,  64
   bn.mulqacc            w10.3, w12.2,  64
   bn.mulqacc            w11.0, w12.1,  64
   bn.mulqacc.so w17.L,  w11.1, w12.0,  64
   bn.mulqacc            w10.1, w13.1,   0
   bn.mulqacc            w10.2, w13.0,   0
   bn.mulqacc            w10.3, w12.3,   0
   bn.mulqacc            w11.0, w12.2,   0
   bn.mulqacc            w11.1, w12.1,   0
   bn.mulqacc            w10.2, w13.1,  64
   bn.mulqacc            w10.3, w13.0,  64
   bn.mulqacc            w11.0, w12.3,  64
   bn.mulqacc.so w17.U,  w11.1, w12.2,  64

   /* Compute r = x-r2 = x-q3*p
      since 0 <= r < 3*p, we only need to consider the lower limbs of x and r2
      r[511:0] = [w22, w21] - [w17, w16] */
   bn.sub w21, w21, w16
   bn.subb w22, w22, w17

   /* Barrett algorithm requires subtraction of the modulus at most two times if
      result is too large. However in the special case of P-384 we need to
      subtract only once */
   bn.sub w16, w21, w12
   bn.subb w17, w22, w13
   bn.sel w16, w21, w16, C
   bn.sel w17, w22, w17, C

   /* return result: c =[w17, w16] =  a * b % p. */
   ret
	/* Copyright lowRISC contributors. */
	/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
	/* SPDX-License-Identifier: Apache-2.0 */
	/*
	* Constant time 384 bit modular multiplication based on Barrett reduction
	* algorithm.
	*
	* Provides a modular multiplication kernel for 384 bit wide numbers
	* for the modulus and barret constant of NIST curve P384 only.
	*/

	.section .text

	/**
	* 384-bit modular multiplication based on Barrett reduction algorithm
	* optimized for the special modulus of the NIST P-384 curve.
	*
	* Returns c = a x b % p.
	*
	* Expects: two operands, modulus p and pre-calculated parameter u for barrett
	* reduction (usually greek mu in literature). u is expected without the
	* leading 1 at bit 384. u has to be pre-calculated as u = floor(2^768/p).
	*
	* This implementation mostly follows the description in the
	* "Handbook of Applied Cryptography" in Algorithm 14.42.
	* Differences:
	* - This implementation incorporates a multiplication before the reduction.
	* Therefore it expects two operands (a, b) instead of a wider integer x.
	* - The computation of q2 ignores the MSbs of q1 and u to allow using
	* a 384x384 bit multiplication. This is compensated later by
	* individual (conditional) additions.
	* - The truncations in step 2 of HAC 14.42 in the form of (... mod b^(k+1) )
	* are not implemented here and the full register width is used. This
	* allows to omit computation of r1 (since r1=x) and step 3 of HAC 14.42
	*
	* Flags: Flags when leaving this subroutine depend on a potentially discarded
	* value and therefore are not usable after return.
	*
	* @param[in] [w11, w10]: a, first operand, max. length 384 bit, a < m.
	* @param[in] [w17, w16]: b, second operand, max. length 384 bit, b < m.
	* @param[in] [w13, w12]: p, modulus of P384 i.e.:
	m = 2^384 - 2^128 - 2^96 + 2^32 - 1.
	* @param[in] [w15, w14]: u, pre-computed Barrett constant (without u[384]/MSb
	* of u which is always 1 for the allowed range but
	* has to be set to 0 here).
	* @param[in] w31: all-zero.
	* @param[out] [w17, w16]: c, result, max. length 384 bit.
	*
	* Clobbered registers: w16, w17, w18, w19, w20, w21, w22, w23, w24
	* Clobbered flag groups: FG0
	*/
	.globl barrett384_p384
	barrett384_p384:
	/* Compute the integer product of the operands x = a * b
	x = [w18, w22, w21] = a * b = [w11, w10] * [w17, w16]
	=> max. length x: 768 bit */
	bn.mulqacc.z w10.0, w16.0, 0
	bn.mulqacc w10.0, w16.1, 64
	bn.mulqacc.so w21.L, w10.1, w16.0, 64
	bn.mulqacc w10.0, w16.2, 0
	bn.mulqacc w10.1, w16.1, 0
	bn.mulqacc w10.2, w16.0, 0
	bn.mulqacc w10.0, w16.3, 64
	bn.mulqacc w10.1, w16.2, 64
	bn.mulqacc w10.2, w16.1, 64
	bn.mulqacc.so w21.U, w10.3, w16.0, 64
	bn.mulqacc w10.0, w17.0, 0
	bn.mulqacc w10.1, w16.3, 0
	bn.mulqacc w10.2, w16.2, 0
	bn.mulqacc w10.3, w16.1, 0
	bn.mulqacc w11.0, w16.0, 0
	bn.mulqacc w10.0, w17.1, 64
	bn.mulqacc w10.1, w17.0, 64
	bn.mulqacc w10.2, w16.3, 64
	bn.mulqacc w10.3, w16.2, 64
	bn.mulqacc w11.0, w16.1, 64
	bn.mulqacc.so w22.L, w11.1, w16.0, 64
	bn.mulqacc w10.1, w17.1, 0
	bn.mulqacc w10.2, w17.0, 0
	bn.mulqacc w10.3, w16.3, 0
	bn.mulqacc w11.0, w16.2, 0
	bn.mulqacc w11.1, w16.1, 0
	bn.mulqacc w10.2, w17.1, 64
	bn.mulqacc w10.3, w17.0, 64
	bn.mulqacc w11.0, w16.3, 64
	bn.mulqacc.so w22.U, w11.1, w16.2, 64
	bn.mulqacc w10.3, w17.1, 0
	bn.mulqacc w11.0, w17.0, 0
	bn.mulqacc w11.1, w16.3, 0
	bn.mulqacc w11.0, w17.1, 64
	bn.mulqacc.so w18.L, w11.1, w17.0, 64
	bn.mulqacc.so w18.U, w11.1, w17.1, 0

	/* Store correction factor to compensate for later neglected MSb of x.
	x is 768 bit wide and therefore the 383 bit right shifted version q1
	(below) contains 385 bit. Bit 384 of q1 is neglected to allow using a
	384x384 multiplier. For the MSb of x being set we temporary store u
	(or zero) here to be used in a later constant time correction of a
	multiplication with u. Note that this requires the MSb flag being carried
	over from the multiplication routine. */
	bn.sel w23, w14, w31, M
	bn.sel w24, w15, w31, M

	/* Compute q1 = x >> 383
	q1 = [w11, w10] = [w18, w22, w21] >> 383 = [w18, w21] >> 127
	=> max length q1: 385 bits */
	bn.rshi w11, w31, w18 >> 127
	bn.rshi w10, w18, w22 >> 127

	/* Compute q2 = q1*u
	Instead of full q2 (which would be up to 770 bits) we ignore the MSb of u
	and the MSb of q1 and correct this later. This allows using a 384x384
	multiplier. We use the property that u for the modulus of P384 is zero in
	the bits 383 downto 129 and use a 384x192 multiplication routine.
	=> max. length q2': 513 bit
	q2' = q1[383:0]u[128:0] = [w18, w17, w16] = [w11, w10] [w15, w14] */

	/* 576 = 384192 bit multiplication kernel /
	bn.mulqacc.z w10.0, w14.0, 0
	bn.mulqacc w10.0, w14.1, 64
	bn.mulqacc.so w16.L, w10.1, w14.0, 64
	bn.mulqacc w10.0, w14.2, 0
	bn.mulqacc w10.1, w14.1, 0
	bn.mulqacc w10.2, w14.0, 0
	bn.mulqacc w10.1, w14.2, 64
	bn.mulqacc w10.2, w14.1, 64
	bn.mulqacc.so w16.U, w10.3, w14.0, 64
	bn.mulqacc w10.2, w14.2, 0
	bn.mulqacc w10.3, w14.1, 0
	bn.mulqacc w11.0, w14.0, 0
	bn.mulqacc w10.3, w14.2, 64
	bn.mulqacc w11.0, w14.1, 64
	bn.mulqacc.so w17.L, w11.1, w14.0, 64
	bn.mulqacc w11.0, w14.2, 0
	bn.mulqacc w11.1, w14.1, 0
	bn.mulqacc.so w17.U, w11.1, w14.2, 64

	/* w14.3 is always zero here due to structure of Barrett constant */
	bn.mulqacc.wo w18, w11.1, w14.3, 64

	/* q3 = q2 >> 385
	In this step, the compensation for the neglected MSbs of q1 and u is
	carried out underway. To add them in the q2 domain, they would have to be
	left shifted by 384 bit first. To directly add them we first shift q2' by
	384 bit to the right, perform the additions, and shift the result another
	bit to the right. The additions cannot overflow due to leading zeros
	after shift.
	q2'' = q2' >> 384 = [w20, w19] = [w18, w17, w16] >> 384
	= [w18, w17] >> 128 */
	bn.rshi w20, w31, w18 >> 128
	bn.rshi w19, w18, w17 >> 128
	/* Add q1. This is unconditional since MSb of u is always 1.
	This cannot overflow due to leading zeros.
	q2''' = q2'' + q1 = [w20, w19] = [w20, w19] + [w10, w11] */
	bn.add w19, w19, w10
	bn.addc w20, w20, w11
	/* Conditionally add u (without leading 1) in case of MSb of x being set.
	This is the "real" q2 but shifted by 384 bits to the right. This cannot
	overflow due to leading zeros
	q2'''' = x[767]?q2'''+u[383:0]:q2'''
	= [w20, w19] + [w24, w23] = q2 >> 384 */
	bn.add w19, w19, w23
	bn.addc w20, w20, w24
	/* finally this gives q3 by shifting the remaining bit to the right
	q3 = q2 >> 385 = q2'''' >> 1 = [w11, w10] = [w20, w19] >> 1 */
	bn.rshi w11, w31, w20 >> 1
	bn.rshi w10, w20, w19 >> 1

	/* r2 = q3m[511:0] = [w17, w16] = ([w11, w10] [w13, w12])[511:0]
	A 384x384 bit multiplication kernel is used here, hence both q3 or p
	must not be wider than 384 bit. This is always the case for p. For q3 it
	is the case if a<p and b<p.
	The 256 highest bits of the multiplication result are not needed,
	so we do not compute them. */
	bn.mulqacc.z w10.0, w12.0, 0
	bn.mulqacc w10.0, w12.1, 64
	bn.mulqacc.so w16.L, w10.1, w12.0, 64
	bn.mulqacc w10.0, w12.2, 0
	bn.mulqacc w10.1, w12.1, 0
	bn.mulqacc w10.2, w12.0, 0
	bn.mulqacc w10.0, w12.3, 64
	bn.mulqacc w10.1, w12.2, 64
	bn.mulqacc w10.2, w12.1, 64
	bn.mulqacc.so w16.U, w10.3, w12.0, 64
	bn.mulqacc w10.0, w13.0, 0
	bn.mulqacc w10.1, w12.3, 0
	bn.mulqacc w10.2, w12.2, 0
	bn.mulqacc w10.3, w12.1, 0
	bn.mulqacc w11.0, w12.0, 0
	bn.mulqacc w10.0, w13.1, 64
	bn.mulqacc w10.1, w13.0, 64
	bn.mulqacc w10.2, w12.3, 64
	bn.mulqacc w10.3, w12.2, 64
	bn.mulqacc w11.0, w12.1, 64
	bn.mulqacc.so w17.L, w11.1, w12.0, 64
	bn.mulqacc w10.1, w13.1, 0
	bn.mulqacc w10.2, w13.0, 0
	bn.mulqacc w10.3, w12.3, 0
	bn.mulqacc w11.0, w12.2, 0
	bn.mulqacc w11.1, w12.1, 0
	bn.mulqacc w10.2, w13.1, 64
	bn.mulqacc w10.3, w13.0, 64
	bn.mulqacc w11.0, w12.3, 64
	bn.mulqacc.so w17.U, w11.1, w12.2, 64

	/* Compute r = x-r2 = x-q3*p
	since 0 <= r < 3*p, we only need to consider the lower limbs of x and r2
	r[511:0] = [w22, w21] - [w17, w16] */
	bn.sub w21, w21, w16
	bn.subb w22, w22, w17

	/* Barrett algorithm requires subtraction of the modulus at most two times if
	result is too large. However in the special case of P-384 we need to
	subtract only once */
	bn.sub w16, w21, w12
	bn.subb w17, w22, w13
	bn.sel w16, w21, w16, C
	bn.sel w17, w22, w17, C

	/* return result: c =[w17, w16] = a * b % p. */
	ret