sw/otbn/crypto/rsa_verify_3072_rr.s - 3p/lowrisc/opentitan - Git at Google

 /* Copyright lowRISC contributors. */
 /* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
 /* SPDX-License-Identifier: Apache-2.0 */

 /* The interface for this file can be accessed through the following symbols.
  * All of them are declared weak in this file, so can be overridden by code
  * that links against this object:
  *
  *   in_mod:   INPUT
  *             384 bytes
  *             The modulus
  *
  *   rr:    INPUT
  *             384 bytes
  *             The Montgomery transformation constant R^2 = (2^3072)^2 mod N
  */

 .text

 /**
  * Subtracts the modulus M from A.
  *
  * Flags: After this subroutine, the C flag is set to 1 if the subtraction
  * underflowed.
  *
  * This routine runs in variable time.
  * @param[in]  x16: dmem pointer to first limb of modulus M
  * @param[in]  [w4:w15]: operand A
  * @param[in]  w31: all-zero
  * @param[out] [w16:w27]: result
  *
  * clobbered registers: x8 to x12, w2, w3, w16 to w27
  * clobbered Flag Groups: FG0
  */
 subtract_modulus_var:

   /* Prepare temporary registers. */
   li        x8, 4
   li        x9, 2
   li        x10, 3
   li        x11, 16

   /* Copy pointer to modulus. */
   addi      x12, x16, 0

   /* Clear flags. */
   bn.add    w31, w31, w31

   /* Subtract M from A. */
   loopi     12, 4
     /* w2 <= A[i] */
     bn.movr   x9, x8++
     /* w3 <= M[i] */
     bn.lid    x10, 0(x12++)
     /* w2 <= w2 - w3 */
     bn.subb   w2, w2, w3
     /* out[i] <= w2 */
     bn.movr   x11++, x9

   ret

 /**
  * Doubles a number and reduces modulo M in-place.
  *
  *   Returns: C = (A + A) mod M
  *
  * Requires that A < M < 2^3072. Writes output to the A buffer in DMEM. This
  * routine runs in variable time.
  *
  * Flags: Flags have no meaning beyond the scope of this subroutine.
  *
  * @param[in]  x16: dmem pointer to first limb of modulus M
  * @param[in]  [w4:w15]: operand A
  * @param[in]  w31: all-zero
  * @param[out] [w4:w15]: result C
  *
  * clobbered registers: x2, x3, x7 to x12, w2 to w27
  * clobbered Flag Groups: FG0
  */
 double_mod_var:
   /* Save copy of pointer to modulus. */
   addi      x12, x16, 0

   /* Clear flags. */
   bn.add    w31, w31, w31

   /* Compute aa=(A + A).
        [w4:w15] <= (A+A) mod 2^3072 = aa[0:3071]*/
   li        x9, 2
   li        x10, 4
   loopi     12, 3
     /* w2 <= a[i] */
     bn.movr   x9, x10
     /* w2 <= w2 + w2 */
     bn.addc   w2, w2, w2
     /* aa[i] <= w2 */
     bn.movr   x10++, x9

   /* Extract final carry bit from flags register.
        x2 <= aa[3072] */
   csrrs     x2, 0x7c0, x0
   andi      x2, x2, 1

   jal       x1, subtract_modulus_var

   /* Extract final borrow bit from flags register. */
   csrrs     x3, 0x7c0, x0
   andi      x3, x3, 1

   /**
    * Select either aa or aa' based on carry/borrow bits.
    *
    * If aa < M, it follows that the carry bit aa[3072] = 0 (since M < 2^3072).
    * It also follows that the borrow from subtracting M must be 1. In this
    * case, select aa; otherwise, select aa-M.
    */

   /* x2 <= (!x2) & x3 */
   xori      x2, x2, 1
   and       x2, x2, x3

   /* Select aa if x2 = 0, otherwise aa-M. */
   bne       x2, x0, sel_aa

   /* Copy subtraction result to w4:w15. */
   li        x8, 4
   li        x11, 16
   loopi     12, 2
     bn.movr   x8, x11++
     addi      x8, x8, 1

 sel_aa:

   ret

 /**
  * Computes the R^2 Montgomery constant and stores it in DMEM.
  *
  *   Returns RR = (2^3072)^2 mod M
  *
  * This implementation is based on section 2.3 of "Montgomery Arithmetic from a
  * Software Perspective" (https://eprint.iacr.org/2017/1057). For the purposes
  * of RSA 3072, the parameters w and n from the paper are fixed (w=256, n=12).
  *
  * A note on bounds: the algorithm from the paper states an assumption that
  * 2^wn-1 <= M < 2^wn (in our case, 2^3071 <= M < 2^3072). We make that
  * assumption here too, because it agrees with FIPS 186-4 section B.3.1 (page
  * 53), which states that the prime factors of the RSA modulus must be at least
  * sqrt(2)*2^(nlen/2-1) (where nlen is the key length, 3072 in this case).
  *
  * The result is stored in dmem[rr]. This routine runs in variable time.
  *
  * Flags: Flags have no meaning beyond the scope of this subroutine.
  *
  * @param[in]  dmem[in_mod] pointer to first limb of modulus M in dmem
  *
  * clobbered registers: x9, x10, x16, w4 to w16, w31
  * clobbered Flag Groups: FG0
  */
  .globl compute_rr
 compute_rr:
   /* Prepare all-zero register. */
   bn.xor    w31, w31, w31

   /* Set pointer to modulus. */
   la        x16, in_mod

   /* Zero [w4:w15]. */
   li        x9, 4
   li        x10, 31
   loopi     12, 1
     bn.movr   x9++, x10

   /* w16 <= 1 */
   bn.addi   w16, w31, 1

   /* Initialize c0.
      c0 = [w4:w15] <= [w4:w16] >> 1 = 2^3701 */
   bn.rshi   w15, w16, w15 >> 1

   /* One modular doubling to get c1 \equiv 2^3072 mod M.
      c1 = [w4:w15] <= ([w4:w15] * 2) mod M = (2^3072) mod M */
   jal     x1, double_mod_var

   /* Compute (2^3072)^2 mod M by performing 3072 modular doublings. */
   li      x9, 3072
   loop     x9, 4
     jal     x1, double_mod_var
     /* Nop because loop can't end on a jump instruction. */
     nop

   /* Store result in dmem[rr]. */
   li        x9, 4
   la        x10, rr
   loopi     12, 2
     bn.sid    x9, 0(x10++)
     addi      x9, x9, 1

   ret

 /* Input buffer for the modulus. */
 .section .data.in_mod
 .weak in_mod
 in_mod:
   .zero 384

 /* Output buffer for the Montgomery transformation constant R^2. */
 .section .data.rr
 .weak rr
 rr:
   .zero 384
	/* Copyright lowRISC contributors. */
	/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
	/* SPDX-License-Identifier: Apache-2.0 */

	/* The interface for this file can be accessed through the following symbols.
	* All of them are declared weak in this file, so can be overridden by code
	* that links against this object:
	*
	* in_mod: INPUT
	* 384 bytes
	* The modulus
	*
	* rr: INPUT
	* 384 bytes
	* The Montgomery transformation constant R^2 = (2^3072)^2 mod N
	*/

	.text

	/**
	* Subtracts the modulus M from A.
	*
	* Flags: After this subroutine, the C flag is set to 1 if the subtraction
	* underflowed.
	*
	* This routine runs in variable time.
	* @param[in] x16: dmem pointer to first limb of modulus M
	* @param[in] [w4:w15]: operand A
	* @param[in] w31: all-zero
	* @param[out] [w16:w27]: result
	*
	* clobbered registers: x8 to x12, w2, w3, w16 to w27
	* clobbered Flag Groups: FG0
	*/
	subtract_modulus_var:

	/* Prepare temporary registers. */
	li x8, 4
	li x9, 2
	li x10, 3
	li x11, 16

	/* Copy pointer to modulus. */
	addi x12, x16, 0

	/* Clear flags. */
	bn.add w31, w31, w31

	/* Subtract M from A. */
	loopi 12, 4
	/* w2 <= A[i] */
	bn.movr x9, x8++
	/* w3 <= M[i] */
	bn.lid x10, 0(x12++)
	/* w2 <= w2 - w3 */
	bn.subb w2, w2, w3
	/* out[i] <= w2 */
	bn.movr x11++, x9

	ret

	/**
	* Doubles a number and reduces modulo M in-place.
	*
	* Returns: C = (A + A) mod M
	*
	* Requires that A < M < 2^3072. Writes output to the A buffer in DMEM. This
	* routine runs in variable time.
	*
	* Flags: Flags have no meaning beyond the scope of this subroutine.
	*
	* @param[in] x16: dmem pointer to first limb of modulus M
	* @param[in] [w4:w15]: operand A
	* @param[in] w31: all-zero
	* @param[out] [w4:w15]: result C
	*
	* clobbered registers: x2, x3, x7 to x12, w2 to w27
	* clobbered Flag Groups: FG0
	*/
	double_mod_var:
	/* Save copy of pointer to modulus. */
	addi x12, x16, 0

	/* Clear flags. */
	bn.add w31, w31, w31

	/* Compute aa=(A + A).
	[w4:w15] <= (A+A) mod 2^3072 = aa[0:3071]*/
	li x9, 2
	li x10, 4
	loopi 12, 3
	/* w2 <= a[i] */
	bn.movr x9, x10
	/* w2 <= w2 + w2 */
	bn.addc w2, w2, w2
	/* aa[i] <= w2 */
	bn.movr x10++, x9

	/* Extract final carry bit from flags register.
	x2 <= aa[3072] */
	csrrs x2, 0x7c0, x0
	andi x2, x2, 1

	jal x1, subtract_modulus_var

	/* Extract final borrow bit from flags register. */
	csrrs x3, 0x7c0, x0
	andi x3, x3, 1

	/**
	* Select either aa or aa' based on carry/borrow bits.
	*
	* If aa < M, it follows that the carry bit aa[3072] = 0 (since M < 2^3072).
	* It also follows that the borrow from subtracting M must be 1. In this
	* case, select aa; otherwise, select aa-M.
	*/

	/* x2 <= (!x2) & x3 */
	xori x2, x2, 1
	and x2, x2, x3

	/* Select aa if x2 = 0, otherwise aa-M. */
	bne x2, x0, sel_aa

	/* Copy subtraction result to w4:w15. */
	li x8, 4
	li x11, 16
	loopi 12, 2
	bn.movr x8, x11++
	addi x8, x8, 1

	sel_aa:

	ret

	/**
	* Computes the R^2 Montgomery constant and stores it in DMEM.
	*
	* Returns RR = (2^3072)^2 mod M
	*
	* This implementation is based on section 2.3 of "Montgomery Arithmetic from a
	* Software Perspective" (https://eprint.iacr.org/2017/1057). For the purposes
	* of RSA 3072, the parameters w and n from the paper are fixed (w=256, n=12).
	*
	* A note on bounds: the algorithm from the paper states an assumption that
	* 2^wn-1 <= M < 2^wn (in our case, 2^3071 <= M < 2^3072). We make that
	* assumption here too, because it agrees with FIPS 186-4 section B.3.1 (page
	* 53), which states that the prime factors of the RSA modulus must be at least
	* sqrt(2)*2^(nlen/2-1) (where nlen is the key length, 3072 in this case).
	*
	* The result is stored in dmem[rr]. This routine runs in variable time.
	*
	* Flags: Flags have no meaning beyond the scope of this subroutine.
	*
	* @param[in] dmem[in_mod] pointer to first limb of modulus M in dmem
	*
	* clobbered registers: x9, x10, x16, w4 to w16, w31
	* clobbered Flag Groups: FG0
	*/
	.globl compute_rr
	compute_rr:
	/* Prepare all-zero register. */
	bn.xor w31, w31, w31

	/* Set pointer to modulus. */
	la x16, in_mod

	/* Zero [w4:w15]. */
	li x9, 4
	li x10, 31
	loopi 12, 1
	bn.movr x9++, x10

	/* w16 <= 1 */
	bn.addi w16, w31, 1

	/* Initialize c0.
	c0 = [w4:w15] <= [w4:w16] >> 1 = 2^3701 */
	bn.rshi w15, w16, w15 >> 1

	/* One modular doubling to get c1 \equiv 2^3072 mod M.
	c1 = [w4:w15] <= ([w4:w15] * 2) mod M = (2^3072) mod M */
	jal x1, double_mod_var

	/* Compute (2^3072)^2 mod M by performing 3072 modular doublings. */
	li x9, 3072
	loop x9, 4
	jal x1, double_mod_var
	/* Nop because loop can't end on a jump instruction. */
	nop

	/* Store result in dmem[rr]. */
	li x9, 4
	la x10, rr
	loopi 12, 2
	bn.sid x9, 0(x10++)
	addi x9, x9, 1

	ret

	/* Input buffer for the modulus. */
	.section .data.in_mod
	.weak in_mod
	in_mod:
	.zero 384

	/* Output buffer for the Montgomery transformation constant R^2. */
	.section .data.rr
	.weak rr
	rr:
	.zero 384