 /* Source: sw/otbn/crypto/rsa_verify_3072_rr.s - 3p/lowrisc/opentitan - Git at Google */

 /* Copyright lowRISC contributors. */
 /* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
 /* SPDX-License-Identifier: Apache-2.0 */

 /* The interface for this file can be accessed through the following symbols.
  * All of them are declared weak in this file, so can be overridden by code
  * that links against this object:
  *
  *   in_mod:   INPUT
  *             384 bytes
  *             The modulus
  *
  *   rr:    OUTPUT
  *             384 bytes
  *             The Montgomery transformation constant R^2 = (2^3072)^2 mod N
  *
  *   m0inv: INPUT
  *             32 bytes
  *             The Montgomery constant -(N^-1) mod 2^256
  */

 .text

 /**
  * Subtracts the modulus M from A.
  *
  * Walks the 12 limbs of A (held in [w4:w15]) and of M (loaded from DMEM) from
  * the first limb upwards, subtracting with borrow, and writes the 12 result
  * limbs to [w16:w27]. The operand registers [w4:w15] and the pointer in x16
  * are not modified; the pointer is copied to x12 before it is advanced.
  *
  * Flags: After this subroutine, the C flag is set to 1 if the subtraction
  * underflowed.
  *
  * This routine runs in variable time.
  *
  * @param[in]  x16: dmem pointer to first limb of modulus M
  * @param[in]  [w4:w15]: operand A
  * @param[in]  w31: all-zero
  * @param[out] [w16:w27]: result
  *
  * clobbered registers: x8 to x12, w2, w3, w16 to w27
  * clobbered Flag Groups: FG0
  */
 subtract_modulus_var:

   /* Prepare temporary registers:
        x8  = 4  : index of the next limb of A to read (starts at w4)
        x9  = 2  : index of scratch register w2
        x10 = 3  : index of scratch register w3
        x11 = 16 : index of the next result limb to write (starts at w16) */
   li        x8, 4
   li        x9, 2
   li        x10, 3
   li        x11, 16

   /* Copy pointer to modulus, so that x16 itself is never advanced. */
   addi      x12, x16, 0

   /* Clear flags. w31 is all-zero, so this addition produces no carry and the
      borrow chain below starts with C = 0. */
   bn.add    w31, w31, w31

   /* Subtract M from A: 12 iterations, 4 instructions per iteration. */
   loopi     12, 4
     /* w2 <= A[i]; x8 then advances to the next limb of A */
     bn.movr   x9, x8++
     /* w3 <= M[i]; x12 then advances to the next limb of M */
     bn.lid    x10, 0(x12++)
     /* w2 <= w2 - w3 - borrow from the previous limb */
     bn.subb   w2, w2, w3
     /* out[i] <= w2; x11 then advances to the next result register */
     bn.movr   x11++, x9

   ret

 /**
  * Doubles a number and reduces modulo M in-place.
  *
  *   Returns: C = (A + A) mod M
  *
  * Requires that A < M < 2^3072. The result overwrites the operand in
  * [w4:w15]; DMEM is only read (for the modulus) and never written. This
  * routine runs in variable time.
  *
  * Flags: Flags have no meaning beyond the scope of this subroutine.
  *
  * @param[in]  x16: dmem pointer to first limb of modulus M
  * @param[in]  [w4:w15]: operand A
  * @param[in]  w31: all-zero
  * @param[out] [w4:w15]: result C
  *
  * clobbered registers: x2, x3, x8 to x12, w2 to w27
  * clobbered Flag Groups: FG0
  */
 double_mod_var:
   /* Clear flags. w31 is all-zero, so this addition produces no carry and the
      carry chain below starts with C = 0. */
   bn.add    w31, w31, w31

   /* Compute aa=(A + A).
        [w4:w15] <= (A+A) mod 2^3072 = aa[0:3071]
      x9 points at scratch register w2, x10 walks over w4..w15. */
   li        x9, 2
   li        x10, 4
   loopi     12, 3
     /* w2 <= a[i] */
     bn.movr   x9, x10
     /* w2 <= w2 + w2 + carry from the previous limb */
     bn.addc   w2, w2, w2
     /* aa[i] <= w2; x10 then advances to the next limb */
     bn.movr   x10++, x9

   /* Extract final carry bit from flags register.
        x2 <= aa[3072] */
   csrrs     x2, 0x7c0, x0
   andi      x2, x2, 1

   /* [w16:w27] <= aa[0:3071] - M; the C flag holds the final borrow.
      subtract_modulus_var sets up its own copy of the modulus pointer from
      x16, so no pointer needs to be prepared here. */
   jal       x1, subtract_modulus_var

   /* Extract final borrow bit from flags register. */
   csrrs     x3, 0x7c0, x0
   andi      x3, x3, 1

   /**
    * Select either aa or aa-M based on carry/borrow bits.
    *
    * If aa < M, it follows that the carry bit aa[3072] = 0 (since M < 2^3072).
    * It also follows that the borrow from subtracting M must be 1. In this
    * case, select aa; otherwise, select aa-M.
    */

   /* x2 <= (!x2) & x3, i.e. x2 = 1 exactly when carry = 0 and borrow = 1. */
   xori      x2, x2, 1
   and       x2, x2, x3

   /* Keep aa (already in w4:w15) if x2 = 1 by skipping the copy below;
      otherwise fall through and select aa-M. */
   bne       x2, x0, sel_aa

   /* Copy subtraction result [w16:w27] to [w4:w15]. */
   li        x8, 4
   li        x11, 16
   loopi     12, 2
     /* w[x8] <= w[x11]; x11 then advances to the next source register */
     bn.movr   x8, x11++
     addi      x8, x8, 1

 sel_aa:

   ret

 /**
  * Computes the R^2 Montgomery constant and stores it in DMEM.
  *
  *   Returns RR = (2^3072)^2 mod M
  *
  * Outline: start from 2^3071, double it modulo M 97 times to obtain
  * T = (2^96 * R) mod M with R = 2^3072, then apply montmul(x, x) five times,
  * storing the intermediate value in dmem[rr] after every step.
  *
  * A note on bounds: This computation assumes that 2^3071 <= M < 2^3072. This
  * agrees with FIPS 186-4 section B.3.1 (page 53), which states that the prime
  * factors of the RSA modulus must be at least sqrt(2)*2^(nlen/2-1) (where nlen
  * is the key length, 3072 in this case).
  *
  * The result is stored in dmem[rr]. This routine runs in variable time.
  *
  * Flags: Flags have no meaning beyond the scope of this subroutine.
  *
  * @param[in]  dmem[in_mod] modulus M (its address is passed on in x16)
  * @param[in]  dmem[m0inv]  Montgomery constant (-(M^-1) mod 2^256)
  * @param[out] dmem[rr]     Montgomery constant (R^2) mod M
  *
  * clobbered registers: x0 to x3, x6 to x13, x16, x17, x19 to x22, x24,
  *                      w2 to w31
  * clobbered Flag Groups: FG0, FG1
  */
  .globl compute_rr
 compute_rr:
   /* Prepare all-zero register. */
   bn.xor    w31, w31, w31

   /* Set pointers to DMEM buffers.
      x16 is the modulus pointer consumed by double_mod_var. x17 is not read
      in this file; presumably montmul takes m0inv through it (confirm against
      montmul's interface). x24 keeps the address of the rr buffer. */
   la        x16, in_mod
   la        x17, m0inv
   la        x24, rr

   /* Zero [w4:w15] by copying the all-zero w31 into each register. */
   li        x9, 4
   li        x10, 31
   loopi     12, 1
     bn.movr   x9++, x10

   /* w16 <= 1 */
   bn.addi   w16, w31, 1

   /* [w4:w15] <= [w4:w16] >> 1 = 2^3071
      Only w15 is written: it receives the low bit of w16 as its top bit. */
   bn.rshi   w15, w16, w15 >> 1

   /* Compute T = (2^96 * R) mod M = 2^96 (montgomery form).
      T = [w4:w15] = (2^97 * 2^3071) mod M = (2^96 * R) mod M
      NOTE(review): the trailing nop is presumably there because a loop body
      may not end on a jump; confirm against the OTBN loop rules. */
   loopi     97,2
     jal x1, double_mod_var
     nop

   /* Store T in output buffer (in preparation for montmul).
      dmem[rr] <= [w4:w15] = T */
   li        x8, 4
   addi      x21, x24, 0
   loopi     12, 2
     bn.sid    x8, 0(x21++)
     addi      x8, x8, 1

   /* Prepare pointers to temp regs for montmul.
      NOTE(review): these are set once, outside the loop below; this relies on
      montmul (not defined in this file) either preserving or re-initialising
      them on every call. Confirm. */
   li        x9, 3
   li        x10, 4
   li        x11, 2

   /* Prepare a pointer to the w4 register for storing the result.
      NOTE(review): x8 is read after every montmul call below, so montmul must
      leave it unchanged. Confirm against montmul's clobber list. */
   li        x8, 4

   /* Five montgomery squares to compute RR = (T^(2^5) * R) mod M.
      The outer loop body is exactly 9 instructions, counting the inner loopi
      and its two body instructions. */
   loopi     5,9
     /* [w4:w15] <= montmul(dmem[rr], dmem[rr])
        Both operand pointers (x19, x20) refer to the rr buffer. */
     addi      x19, x24, 0
     addi      x20, x24, 0
     jal       x1, montmul
     /* Store result: dmem[rr] <= [w4:w15]
        x21 walks over the rr buffer, x22 over the registers starting at w4. */
     addi      x21, x24, 0
     addi      x22, x8, 0
     loopi     12, 2
       bn.sid    x22, 0(x21++)
       addi      x22, x22, 1
     /* NOTE(review): presumably keeps the inner and outer loop from ending on
        the same instruction; confirm against the OTBN loop rules. */
     nop

   ret

 /* Input buffer for the Montgomery constant m0_inv.
    32 bytes. compute_rr only takes the address of this buffer (into x17) and
    never reads it directly. Declared weak so that code linking against this
    object can provide its own definition. */
 .section .data.m0inv
 .weak m0inv
 m0inv:
   .zero 32

 /* Input buffer for the modulus.
    384 bytes, read as 12 limbs by subtract_modulus_var through x16. Declared
    weak so that it can be overridden at link time. */
 .section .bss.in_mod
 .weak in_mod
 in_mod:
   .zero 384

 /* Output buffer for the Montgomery transformation constant R^2.
    384 bytes, written as 12 limbs by compute_rr. Declared weak so that it can
    be overridden at link time. */
 .section .bss.rr
 .weak rr
 rr:
   .zero 384
	/* Copyright lowRISC contributors. */
	/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
	/* SPDX-License-Identifier: Apache-2.0 */

	/* The interface for this file can be accessed through the following symbols.
	* All of them are declared weak in this file, so can be overridden by code
	* that links against this object:
	*
	* in_mod: INPUT
	* 384 bytes
	* The modulus
	*
	* rr: OUTPUT
	* 384 bytes
	* The Montgomery transformation constant R^2 = (2^3072)^2 mod N
	*
	* m0inv: INPUT
	* 32 bytes
	* The Montgomery constant -(N^-1) mod 2^256
	*/

	.text

	/**
	* Subtracts the modulus M from A.
	*
	* Walks the 12 limbs of A (held in [w4:w15]) and of M (loaded from DMEM) from
	* the first limb upwards, subtracting with borrow, and writes the 12 result
	* limbs to [w16:w27]. The operand registers [w4:w15] and the pointer in x16
	* are not modified; the pointer is copied to x12 before it is advanced.
	*
	* NOTE(review): the label subtract_modulus_var is also defined earlier in
	* this file; assembling both copies would give a duplicate-symbol conflict.
	* Confirm which copy should remain.
	*
	* Flags: After this subroutine, the C flag is set to 1 if the subtraction
	* underflowed.
	*
	* This routine runs in variable time.
	*
	* @param[in] x16: dmem pointer to first limb of modulus M
	* @param[in] [w4:w15]: operand A
	* @param[in] w31: all-zero
	* @param[out] [w16:w27]: result
	*
	* clobbered registers: x8 to x12, w2, w3, w16 to w27
	* clobbered Flag Groups: FG0
	*/
	subtract_modulus_var:

	/* Prepare temporary registers:
	x8 = 4 : index of the next limb of A to read (starts at w4)
	x9 = 2 : index of scratch register w2
	x10 = 3 : index of scratch register w3
	x11 = 16 : index of the next result limb to write (starts at w16) */
	li x8, 4
	li x9, 2
	li x10, 3
	li x11, 16

	/* Copy pointer to modulus, so that x16 itself is never advanced. */
	addi x12, x16, 0

	/* Clear flags. w31 is all-zero, so this addition produces no carry and the
	borrow chain below starts with C = 0. */
	bn.add w31, w31, w31

	/* Subtract M from A: 12 iterations, 4 instructions per iteration. */
	loopi 12, 4
	/* w2 <= A[i]; x8 then advances to the next limb of A */
	bn.movr x9, x8++
	/* w3 <= M[i]; x12 then advances to the next limb of M */
	bn.lid x10, 0(x12++)
	/* w2 <= w2 - w3 - borrow from the previous limb */
	bn.subb w2, w2, w3
	/* out[i] <= w2; x11 then advances to the next result register */
	bn.movr x11++, x9

	ret

	/**
	* Doubles a number and reduces modulo M in-place.
	*
	* Returns: C = (A + A) mod M
	*
	* Requires that A < M < 2^3072. The result overwrites the operand in
	* [w4:w15]; DMEM is only read (for the modulus) and never written. This
	* routine runs in variable time.
	*
	* NOTE(review): the labels double_mod_var and sel_aa are also defined
	* earlier in this file; assembling both copies would give a
	* duplicate-symbol conflict. Confirm which copy should remain.
	*
	* Flags: Flags have no meaning beyond the scope of this subroutine.
	*
	* @param[in] x16: dmem pointer to first limb of modulus M
	* @param[in] [w4:w15]: operand A
	* @param[in] w31: all-zero
	* @param[out] [w4:w15]: result C
	*
	* clobbered registers: x2, x3, x8 to x12, w2 to w27
	* clobbered Flag Groups: FG0
	*/
	double_mod_var:
	  /* Clear flags. w31 is all-zero, so this addition produces no carry and
	     the carry chain below starts with C = 0. */
	  bn.add    w31, w31, w31

	  /* Compute aa=(A + A).
	       [w4:w15] <= (A+A) mod 2^3072 = aa[0:3071]
	     x9 points at scratch register w2, x10 walks over w4..w15. */
	  li        x9, 2
	  li        x10, 4
	  loopi     12, 3
	    /* w2 <= a[i] */
	    bn.movr   x9, x10
	    /* w2 <= w2 + w2 + carry from the previous limb */
	    bn.addc   w2, w2, w2
	    /* aa[i] <= w2; x10 then advances to the next limb */
	    bn.movr   x10++, x9

	  /* Extract final carry bit from flags register.
	       x2 <= aa[3072] */
	  csrrs     x2, 0x7c0, x0
	  andi      x2, x2, 1

	  /* [w16:w27] <= aa[0:3071] - M; the C flag holds the final borrow.
	     subtract_modulus_var sets up its own copy of the modulus pointer from
	     x16, so no pointer needs to be prepared here. */
	  jal       x1, subtract_modulus_var

	  /* Extract final borrow bit from flags register. */
	  csrrs     x3, 0x7c0, x0
	  andi      x3, x3, 1

	  /**
	   * Select either aa or aa-M based on carry/borrow bits.
	   *
	   * If aa < M, it follows that the carry bit aa[3072] = 0 (since
	   * M < 2^3072). It also follows that the borrow from subtracting M must
	   * be 1. In this case, select aa; otherwise, select aa-M.
	   */

	  /* x2 <= (!x2) & x3, i.e. x2 = 1 exactly when carry = 0 and borrow = 1. */
	  xori      x2, x2, 1
	  and       x2, x2, x3

	  /* Keep aa (already in w4:w15) if x2 = 1 by skipping the copy below;
	     otherwise fall through and select aa-M. */
	  bne       x2, x0, sel_aa

	  /* Copy subtraction result [w16:w27] to [w4:w15]. */
	  li        x8, 4
	  li        x11, 16
	  loopi     12, 2
	    /* w[x8] <= w[x11]; x11 then advances to the next source register */
	    bn.movr   x8, x11++
	    addi      x8, x8, 1

	sel_aa:

	  ret

	/**
	* Computes the R^2 Montgomery constant and stores it in DMEM.
	*
	* Returns RR = (2^3072)^2 mod M
	*
	* Outline: start from 2^3071, double it modulo M 97 times to obtain
	* T = (2^96 * R) mod M with R = 2^3072, then apply montmul(x, x) five times,
	* storing the intermediate value in dmem[rr] after every step.
	*
	* NOTE(review): the label compute_rr is also defined (and exported) earlier
	* in this file; assembling both copies would give a duplicate-symbol
	* conflict. Confirm which copy should remain.
	*
	* A note on bounds: This computation assumes that 2^3071 <= M < 2^3072. This
	* agrees with FIPS 186-4 section B.3.1 (page 53), which states that the prime
	* factors of the RSA modulus must be at least sqrt(2)*2^(nlen/2-1) (where nlen
	* is the key length, 3072 in this case).
	*
	* The result is stored in dmem[rr]. This routine runs in variable time.
	*
	* Flags: Flags have no meaning beyond the scope of this subroutine.
	*
	* @param[in] dmem[in_mod] modulus M (its address is passed on in x16)
	* @param[in] dmem[m0inv] Montgomery constant (-(M^-1) mod 2^256)
	* @param[out] dmem[rr] Montgomery constant (R^2) mod M
	*
	* clobbered registers: x0 to x3, x6 to x13, x16, x17, x19 to x22, x24,
	* w2 to w31
	* clobbered Flag Groups: FG0, FG1
	*/
	.globl compute_rr
	compute_rr:
	/* Prepare all-zero register. */
	bn.xor w31, w31, w31

	/* Set pointers to DMEM buffers.
	x16 is the modulus pointer consumed by double_mod_var. x17 is not read
	in this file; presumably montmul takes m0inv through it (confirm against
	montmul's interface). x24 keeps the address of the rr buffer. */
	la x16, in_mod
	la x17, m0inv
	la x24, rr

	/* Zero [w4:w15] by copying the all-zero w31 into each register. */
	li x9, 4
	li x10, 31
	loopi 12, 1
	bn.movr x9++, x10

	/* w16 <= 1 */
	bn.addi w16, w31, 1

	/* [w4:w15] <= [w4:w16] >> 1 = 2^3071
	Only w15 is written: it receives the low bit of w16 as its top bit. */
	bn.rshi w15, w16, w15 >> 1

	/* Compute T = (2^96 * R) mod M = 2^96 (montgomery form).
	T = [w4:w15] = (2^97 * 2^3071) mod M = (2^96 * R) mod M
	NOTE(review): the trailing nop is presumably there because a loop body
	may not end on a jump; confirm against the OTBN loop rules. */
	loopi 97,2
	jal x1, double_mod_var
	nop

	/* Store T in output buffer (in preparation for montmul).
	dmem[rr] <= [w4:w15] = T */
	li x8, 4
	addi x21, x24, 0
	loopi 12, 2
	bn.sid x8, 0(x21++)
	addi x8, x8, 1

	/* Prepare pointers to temp regs for montmul.
	NOTE(review): these are set once, outside the loop below; this relies on
	montmul (not defined in this file) either preserving or re-initialising
	them on every call. Confirm. */
	li x9, 3
	li x10, 4
	li x11, 2

	/* Prepare a pointer to the w4 register for storing the result.
	NOTE(review): x8 is read after every montmul call below, so montmul must
	leave it unchanged. Confirm against montmul's clobber list. */
	li x8, 4

	/* Five montgomery squares to compute RR = (T^(2^5) * R) mod M.
	The outer loop body is exactly 9 instructions, counting the inner loopi
	and its two body instructions. */
	loopi 5,9
	/* [w4:w15] <= montmul(dmem[rr], dmem[rr])
	Both operand pointers (x19, x20) refer to the rr buffer. */
	addi x19, x24, 0
	addi x20, x24, 0
	jal x1, montmul
	/* Store result: dmem[rr] <= [w4:w15]
	x21 walks over the rr buffer, x22 over the registers starting at w4. */
	addi x21, x24, 0
	addi x22, x8, 0
	loopi 12, 2
	bn.sid x22, 0(x21++)
	addi x22, x22, 1
	/* NOTE(review): presumably keeps the inner and outer loop from ending on
	the same instruction; confirm against the OTBN loop rules. */
	nop

	ret

	/* Input buffer for the Montgomery constant m0_inv.
	32 bytes. compute_rr only takes the address of this buffer (into x17) and
	never reads it directly.
	NOTE(review): m0inv, in_mod and rr are also defined earlier in this file;
	confirm which set of definitions should remain. */
	.section .data.m0inv
	.weak m0inv
	m0inv:
	.zero 32

	/* Input buffer for the modulus.
	384 bytes, read as 12 limbs by subtract_modulus_var through x16. */
	.section .bss.in_mod
	.weak in_mod
	in_mod:
	.zero 384

	/* Output buffer for the Montgomery transformation constant R^2.
	384 bytes, written as 12 limbs by compute_rr. */
	.section .bss.rr
	.weak rr
	rr:
	.zero 384