| /* Copyright lowRISC contributors. */ | 
 | /* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ | 
 | /* SPDX-License-Identifier: Apache-2.0 */ | 
 |  | 
 | /* The interface for this file can be accessed through the following symbols. | 
 |  * All of them are declared weak in this file, so can be overridden by code | 
 |  * that links against this object: | 
 |  * | 
 |  *   in_mod:   INPUT | 
 |  *             384 bytes | 
 |  *             The modulus M | 
 |  * | 
 |  *   m0inv:    INPUT | 
 |  *             32 bytes | 
 |  *             The Montgomery constant -(M^-1) mod 2^256 | 
 |  * | 
 |  *   rr:       OUTPUT | 
 |  *             384 bytes | 
 |  *             The Montgomery transformation constant R^2 = (2^3072)^2 mod M | 
 |  */ | 
 |  | 
 | .text | 
 |  | 
 | /** | 
 |  * Subtracts the modulus M from A, limb by limb, propagating the borrow. | 
 |  * | 
 |  *   Returns: [w16:w27] = (A - M) mod 2^3072 | 
 |  * | 
 |  * The operand A in [w4:w15] is only read and is left unchanged. The modulus | 
 |  * pointer x16 is copied into x12 before use, so x16 itself is preserved. | 
 |  * | 
 |  * Flags: After this subroutine, the C flag is set to 1 if the subtraction | 
 |  * underflowed (i.e. A < M). | 
 |  * | 
 |  * This routine runs in variable time. | 
 |  * | 
 |  * @param[in]  x16: dmem pointer to first limb of modulus M | 
 |  * @param[in]  [w4:w15]: operand A | 
 |  * @param[in]  w31: all-zero | 
 |  * @param[out] [w16:w27]: result | 
 |  * | 
 |  * clobbered registers: x8 to x12, w2, w3, w16 to w27 | 
 |  * clobbered Flag Groups: FG0 | 
 |  */ | 
 | subtract_modulus_var: | 
 |  | 
 |   /* Prepare temporary registers (indices into the wide register file): | 
 |        x8  = 4:  next limb of A to read (w4 upwards) | 
 |        x9  = 2:  scratch register w2 | 
 |        x10 = 3:  scratch register w3 | 
 |        x11 = 16: next result limb to write (w16 upwards) */ | 
 |   li        x8, 4 | 
 |   li        x9, 2 | 
 |   li        x10, 3 | 
 |   li        x11, 16 | 
 |  | 
 |   /* Copy pointer to modulus, so that the post-increment in the loop below | 
 |      advances x12 and leaves the caller's x16 untouched. */ | 
 |   addi      x12, x16, 0 | 
 |  | 
 |   /* Clear flags: w31 is all-zero, so 0 + 0 produces no carry. This makes the | 
 |      first bn.subb below start without an incoming borrow. */ | 
 |   bn.add    w31, w31, w31 | 
 |  | 
 |   /* Subtract M from A: 12 iterations (one per limb) of the 4 instructions | 
 |      below. The borrow is carried from one iteration to the next in the C | 
 |      flag, so nothing that modifies the flags may be inserted in the body. */ | 
 |   loopi     12, 4 | 
 |     /* w2 <= A[i] */ | 
 |     bn.movr   x9, x8++ | 
 |     /* w3 <= M[i] */ | 
 |     bn.lid    x10, 0(x12++) | 
 |     /* w2 <= w2 - w3 - borrow */ | 
 |     bn.subb   w2, w2, w3 | 
 |     /* out[i] <= w2 */ | 
 |     bn.movr   x11++, x9 | 
 |  | 
 |   /* The C flag now holds the final borrow; callers read it after return. */ | 
 |   ret | 
 |  | 
 | /** | 
 |  * Doubles a number and reduces modulo M in-place. | 
 |  * | 
 |  *   Returns: C = (A + A) mod M | 
 |  * | 
 |  * Requires that A < M < 2^3072. Writes the output back to the registers that | 
 |  * held A ([w4:w15]); DMEM is only read (for the modulus). This routine runs in | 
 |  * variable time: it branches on the carry/borrow bits of the intermediate | 
 |  * results. | 
 |  * | 
 |  * Flags: Flags have no meaning beyond the scope of this subroutine. | 
 |  * | 
 |  * @param[in]  x16: dmem pointer to first limb of modulus M | 
 |  * @param[in]  [w4:w15]: operand A | 
 |  * @param[in]  w31: all-zero | 
 |  * @param[out] [w4:w15]: result C | 
 |  * | 
 |  * clobbered registers: x2, x3, x7 to x12, w2 to w27 | 
 |  * clobbered Flag Groups: FG0 | 
 |  */ | 
 | double_mod_var: | 
 |   /* Save copy of pointer to modulus. | 
 |      NOTE(review): x12 is not read before the call to subtract_modulus_var, | 
 |      which re-copies x16 into x12 itself, so this copy looks redundant. */ | 
 |   addi      x12, x16, 0 | 
 |  | 
 |   /* Clear flags (w31 is all-zero, so 0 + 0 produces no carry). The first | 
 |      bn.addc below therefore starts without an incoming carry. */ | 
 |   bn.add    w31, w31, w31 | 
 |  | 
 |   /* Compute aa=(A + A). | 
 |        [w4:w15] <= (A+A) mod 2^3072 = aa[0:3071] | 
 |      x9 = 2 selects the scratch register w2; x10 = 4 walks over w4..w15. | 
 |      The carry is passed between iterations in the C flag. */ | 
 |   li        x9, 2 | 
 |   li        x10, 4 | 
 |   loopi     12, 3 | 
 |     /* w2 <= a[i] */ | 
 |     bn.movr   x9, x10 | 
 |     /* w2 <= w2 + w2 + carry */ | 
 |     bn.addc   w2, w2, w2 | 
 |     /* aa[i] <= w2 */ | 
 |     bn.movr   x10++, x9 | 
 |  | 
 |   /* Extract final carry bit from flags register (bit 0 of CSR 0x7c0). | 
 |        x2 <= aa[3072] | 
 |      This must happen before the call below, which overwrites the flags. */ | 
 |   csrrs     x2, 0x7c0, x0 | 
 |   andi      x2, x2, 1 | 
 |  | 
 |   /* [w16:w27] <= (aa - M) mod 2^3072; C flag <= borrow. */ | 
 |   jal       x1, subtract_modulus_var | 
 |  | 
 |   /* Extract final borrow bit from flags register. | 
 |        x3 <= 1 if aa[0:3071] < M, else 0 */ | 
 |   csrrs     x3, 0x7c0, x0 | 
 |   andi      x3, x3, 1 | 
 |  | 
 |   /** | 
 |    * Select either aa or aa' based on carry/borrow bits. | 
 |    * | 
 |    * If aa < M, it follows that the carry bit aa[3072] = 0 (since M < 2^3072). | 
 |    * It also follows that the borrow from subtracting M must be 1. In this | 
 |    * case, select aa; otherwise, select aa-M. | 
 |    */ | 
 |  | 
 |   /* x2 <= (!x2) & x3 | 
 |      i.e. x2 = 1 exactly when there was no carry and the subtraction | 
 |      borrowed, which is the aa < M case. */ | 
 |   xori      x2, x2, 1 | 
 |   and       x2, x2, x3 | 
 |  | 
 |   /* Select aa if x2 = 1 (skip the copy, [w4:w15] already holds aa), | 
 |      otherwise aa-M. */ | 
 |   bne       x2, x0, sel_aa | 
 |  | 
 |   /* Copy subtraction result to w4:w15. | 
 |      x8 walks the destination registers w4..w15, x11 the sources w16..w27. */ | 
 |   li        x8, 4 | 
 |   li        x11, 16 | 
 |   loopi     12, 2 | 
 |     bn.movr   x8, x11++ | 
 |     addi      x8, x8, 1 | 
 |  | 
 | sel_aa: | 
 |  | 
 |   ret | 
 |  | 
 | /** | 
 |  * Computes the R^2 Montgomery constant and stores it in DMEM. | 
 |  * | 
 |  *   Returns RR = (2^3072)^2 mod M | 
 |  * | 
 |  * A note on bounds: This computation assumes that 2^3071 <= M < 2^3072. This | 
 |  * agrees with FIPS 186-4 section B.3.1 (page 53), which states that the prime | 
 |  * factors of the RSA modulus must be at least sqrt(2)*2^(nlen/2-1) (where nlen | 
 |  * is the key length, 3072 in this case). | 
 |  * | 
 |  * Outline: start from 2^3071, double it 97 times modulo M to obtain | 
 |  * T = (2^96 * R) mod M with R = 2^3072, then apply five Montgomery squarings | 
 |  * to T. | 
 |  * | 
 |  * The result is stored in dmem[rr]. This routine runs in variable time. | 
 |  * | 
 |  * Flags: Flags have no meaning beyond the scope of this subroutine. | 
 |  * | 
 |  * @param[in]  dmem[in_mod] modulus M in dmem (first limb at in_mod) | 
 |  * @param[in]  dmem[m0inv]  Montgomery constant (-(M^-1) mod 2^256) | 
 |  * @param[out] dmem[rr]     Montgomery constant (R^2) mod M | 
 |  * | 
 |  * clobbered registers: x0 to x3, x6 to x13, x16, x17, x19 to x22, x24, | 
 |  *                      w2 to w31 | 
 |  * clobbered Flag Groups: FG0, FG1 | 
 |  */ | 
 |  .globl compute_rr | 
 | compute_rr: | 
 |   /* Prepare all-zero register. */ | 
 |   bn.xor    w31, w31, w31 | 
 |  | 
 |   /* Set pointers to DMEM buffers. | 
 |        x16 <= in_mod (read by double_mod_var / subtract_modulus_var) | 
 |        x17 <= m0inv  (not read in this file; presumably consumed by montmul) | 
 |        x24 <= rr     (output buffer, also used as montmul operand below) */ | 
 |   la        x16, in_mod | 
 |   la        x17, m0inv | 
 |   la        x24, rr | 
 |  | 
 |   /* Zero [w4:w15] by copying w31 (index 31 in x10) into w4..w15. */ | 
 |   li        x9, 4 | 
 |   li        x10, 31 | 
 |   loopi     12, 1 | 
 |     bn.movr   x9++, x10 | 
 |  | 
 |   /* w16 <= 1 */ | 
 |   bn.addi   w16, w31, 1 | 
 |  | 
 |   /* [w4:w15] <= [w4:w16] >> 1 = 2^3071 | 
 |      Only w15 needs to be written: shifting the 1 in w16 right by one bit | 
 |      lands it in the top bit of w15, and w4..w14 are already zero. */ | 
 |   bn.rshi   w15, w16, w15 >> 1 | 
 |  | 
 |   /* Compute T = (2^96 * R) mod M = 2^96 (montgomery form). | 
 |      T = [w4:w15] = (2^97 * 2^3071) mod M = (2^96 * R) mod M | 
 |      The trailing nop pads the loop body; presumably it is needed because a | 
 |      loop body may not end with a jump (confirm against the OTBN ISA). */ | 
 |   loopi     97,2 | 
 |     jal x1, double_mod_var | 
 |     nop | 
 |  | 
 |   /* Store T in output buffer (in preparation for montmul). | 
 |      dmem[rr] <= [w4:w15] = T */ | 
 |   li        x8, 4 | 
 |   addi      x21, x24, 0 | 
 |   loopi     12, 2 | 
 |     bn.sid    x8, 0(x21++) | 
 |     addi      x8, x8, 1 | 
 |  | 
 |   /* Prepare pointers to temp regs for montmul. | 
 |      NOTE(review): montmul is not defined in this file. These values are set | 
 |      once but used across five calls, so montmul must leave x9, x10 and x11 | 
 |      intact (or restore them) -- confirm against montmul's clobber list. */ | 
 |   li        x9, 3 | 
 |   li        x10, 4 | 
 |   li        x11, 2 | 
 |  | 
 |   /* Prepare a pointer to the w4 register for storing the result. | 
 |      NOTE(review): x8 is read after each montmul call below, so it must also | 
 |      survive montmul -- confirm. */ | 
 |   li        x8, 4 | 
 |  | 
 |   /* Five montgomery squares to compute RR = (T^(2^5) * R) mod M. | 
 |      Assuming montmul(a, b) = (a * b * R^-1) mod M, each squaring maps | 
 |      (2^k * R) to (2^(2k) * R), so starting from k = 96 the five squarings | 
 |      give 2^(96 * 32) * R = 2^3072 * R = R^2 mod M. | 
 |      The outer loop body is 9 instructions, counting the inner loopi, its | 
 |      two body instructions and the final nop. */ | 
 |   loopi     5,9 | 
 |     /* [w4:w15] <= montmul(dmem[rr], dmem[rr]) */ | 
 |     addi      x19, x24, 0 | 
 |     addi      x20, x24, 0 | 
 |     jal       x1, montmul | 
 |     /* Store result: dmem[rr] <= [w4:w15] */ | 
 |     addi      x21, x24, 0 | 
 |     addi      x22, x8, 0 | 
 |     loopi     12, 2 | 
 |       bn.sid    x22, 0(x21++) | 
 |       addi      x22, x22, 1 | 
 |     /* Padding so that the inner and outer loops do not end on the same | 
 |        instruction (presumably an OTBN loop restriction -- confirm). */ | 
 |     nop | 
 |  | 
 |   ret | 
 |  | 
 | /* Input buffer for the Montgomery constant m0_inv: 32 bytes (256 bits), | 
 |    matching the "mod 2^256" in its definition -(M^-1) mod 2^256. Declared | 
 |    weak so that code linking against this object can override it. */ | 
 | .section .data.m0inv | 
 | .weak m0inv | 
 | m0inv: | 
 |   .zero 32 | 
 |  | 
 | /* Input buffer for the modulus M: 384 bytes (3072 bits, i.e. 12 limbs of | 
 |    32 bytes each). Weak, so it can be overridden at link time. */ | 
 | .section .bss.in_mod | 
 | .weak in_mod | 
 | in_mod: | 
 |   .zero 384 | 
 |  | 
 | /* Output buffer for the Montgomery transformation constant R^2: 384 bytes, | 
 |    the same size as the modulus. compute_rr also uses it as scratch space for | 
 |    the intermediate values between Montgomery squarings. Weak, so it can be | 
 |    overridden at link time. */ | 
 | .section .bss.rr | 
 | .weak rr | 
 | rr: | 
 |   .zero 384 | 