sw/otbn/code-snippets/p384.s - 3p/lowrisc/opentitan - Git at Google

 /* Copyright lowRISC contributors. */
 /* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
 /* SPDX-License-Identifier: Apache-2.0 */
 /*
  *   P-384 specific routines
  */

  .section .text


 /**
  * 384-bit modular multiplication based on Barrett reduction algorithm
  * optimized for the special modulus of the NIST P-384 curve.
  *
  * Returns c = a x b % p.
  *
  * Expects: two operands, modulus p and pre-calculated parameter u for barrett
  * reduction (usually greek mu in literature). u is expected without the
  * leading 1 at bit 384. u has to be pre-calculated as u = floor(2^768/p).
  *
  * This implementation mostly follows the description in the
  * "Handbook of Applied Cryptography" in Algorithm 14.42.
  * Differences:
  *   - This implementation incorporates a multiplication before the reduction.
  *     Therefore it expects two operands (a, b) instead of a wider integer x.
  *   - The computation of q2 ignores the MSbs of q1 and u to allow using
  *     a 384x384 bit multiplication. This is compensated later by
  *     individual (conditional) additions.
  *   - The truncations in step 2 of HAC 14.42 in the form of (... mod b^(k+1) )
  *     are not implemented here and the full register width is used. This
  *     allows to omit computation of r1 (since r1=x) and step 3 of HAC 14.42
  *
  * Flags: Flags when leaving this subroutine depend on a potentially discarded
  *        value and therefore are not usable after return.
  *
  * @param[in] [w11, w10]: a, first operand, max. length 384 bit, a < m.
  * @param[in] [w17, w16]: b, second operand, max. length 384 bit, b < m.
  * @param[in] [w13, w12]: p, modulus of P384 i.e.:
                              m = 2^384 - 2^128 - 2^96 + 2^32 - 1.
  * @param[in] [w15, w14]: u, pre-computed Barrett constant (without u[384]/MSb
  *                           of u which is always 1 for the allowed range but
  *                           has to be set to 0 here).
  * @param[in] w31: all-zero.
  * @param[out] [w17, w16]: c, result, max. length 384 bit.
  *
  * Clobbered registers: w10, w11, w16, w17, w18, w19, w20, w21, w22, w23, w24
  * Clobbered flag groups: FG0
  */
  .globl barrett384_p384
 barrett384_p384:
   /* Compute the integer product of the operands x = a * b
      x = [w18, w22, w21] = a * b = [w11, w10] * [w17, w16]
      => max. length x: 768 bit */
   bn.mulqacc.z          w10.0, w16.0,   0
   bn.mulqacc            w10.0, w16.1,  64
   bn.mulqacc.so w21.L,  w10.1, w16.0,  64
   bn.mulqacc            w10.0, w16.2,   0
   bn.mulqacc            w10.1, w16.1,   0
   bn.mulqacc            w10.2, w16.0,   0
   bn.mulqacc            w10.0, w16.3,  64
   bn.mulqacc            w10.1, w16.2,  64
   bn.mulqacc            w10.2, w16.1,  64
   bn.mulqacc.so w21.U,  w10.3, w16.0,  64
   bn.mulqacc            w10.0, w17.0,   0
   bn.mulqacc            w10.1, w16.3,   0
   bn.mulqacc            w10.2, w16.2,   0
   bn.mulqacc            w10.3, w16.1,   0
   bn.mulqacc            w11.0, w16.0,   0
   bn.mulqacc            w10.0, w17.1,  64
   bn.mulqacc            w10.1, w17.0,  64
   bn.mulqacc            w10.2, w16.3,  64
   bn.mulqacc            w10.3, w16.2,  64
   bn.mulqacc            w11.0, w16.1,  64
   bn.mulqacc.so w22.L,  w11.1, w16.0,  64
   bn.mulqacc            w10.1, w17.1,   0
   bn.mulqacc            w10.2, w17.0,   0
   bn.mulqacc            w10.3, w16.3,   0
   bn.mulqacc            w11.0, w16.2,   0
   bn.mulqacc            w11.1, w16.1,   0
   bn.mulqacc            w10.2, w17.1,  64
   bn.mulqacc            w10.3, w17.0,  64
   bn.mulqacc            w11.0, w16.3,  64
   bn.mulqacc.so w22.U,  w11.1, w16.2,  64
   bn.mulqacc            w10.3, w17.1,   0
   bn.mulqacc            w11.0, w17.0,   0
   bn.mulqacc            w11.1, w16.3,   0
   bn.mulqacc            w11.0, w17.1,  64
   bn.mulqacc.so w18.L,  w11.1, w17.0,  64
   bn.mulqacc.so w18.U,  w11.1, w17.1,   0

   /* Store correction factor to compensate for later neglected MSb of x.
      x is 768 bit wide and therefore the 383 bit right shifted version q1
      (below) contains 385 bit. Bit 384 of q1 is neglected to allow using a
      384x384 multiplier. For the MSb of x being set we temporary store u
      (or zero) here to be used in a later constant time correction of a
      multiplication with u. Note that this requires the MSb flag being carried
      over from the multiplication routine. */
   bn.sel w23, w14, w31, M
   bn.sel w24, w15, w31, M

   /* Compute q1 = x >> 383
      q1 = [w11, w10] = [w18, w22, w21] >> 383  = [w18, w21] >> 127
      => max length q1: 385 bits */
   bn.rshi w11, w31, w18 >> 127
   bn.rshi w10, w18, w22 >> 127

   /* Compute q2 = q1*u
      Instead of full q2 (which would be up to 770 bits) we ignore the MSb of u
      and the MSb of q1 and correct this later. This allows using a 384x384
      multiplier. We use the property that u for the modulus of P384 is zero in
      the bits 383 downto 129 and use a 384x192 multiplication routine.
      => max. length q2': 513 bit
      q2' = q1[383:0]*u[128:0] = [w18, w17, w16] = [w11, w10] * [w15, w14] */

   /* 576 = 384*192 bit multiplication kernel */
   bn.mulqacc.z          w10.0, w14.0,   0
   bn.mulqacc            w10.0, w14.1,  64
   bn.mulqacc.so w16.L,  w10.1, w14.0,  64
   bn.mulqacc            w10.0, w14.2,   0
   bn.mulqacc            w10.1, w14.1,   0
   bn.mulqacc            w10.2, w14.0,   0
   bn.mulqacc            w10.1, w14.2,  64
   bn.mulqacc            w10.2, w14.1,  64
   bn.mulqacc.so w16.U,  w10.3, w14.0,  64
   bn.mulqacc            w10.2, w14.2,   0
   bn.mulqacc            w10.3, w14.1,   0
   bn.mulqacc            w11.0, w14.0,   0
   bn.mulqacc            w10.3, w14.2,  64
   bn.mulqacc            w11.0, w14.1,  64
   bn.mulqacc.so w17.L,  w11.1, w14.0,  64
   bn.mulqacc            w11.0, w14.2,   0
   bn.mulqacc            w11.1, w14.1,   0
   bn.mulqacc.so w17.U,  w11.1, w14.2,  64

   /* w14.3 is always zero here due to structure of Barrett constant */
   bn.mulqacc.wo w18,    w11.1, w14.3,  64

   /* q3 = q2 >> 385
      In this step, the compensation for the neglected MSbs of q1 and u is
      carried out underway. To add them in the q2 domain, they would have to be
      left shifted by 384 bit first. To directly add them we first shift q2' by
      384 bit to the right, perform the additions, and shift the result another
      bit to the right. The additions cannot overflow due to leading zeros
      after shift.
      q2'' = q2' >> 384 = [w20, w19] = [w18, w17, w16] >> 384
                                     = [w18, w17] >> 128 */
   bn.rshi w20, w31, w18 >> 128
   bn.rshi w19, w18, w17 >> 128
   /* Add q1. This is unconditional since MSb of u is always 1.
      This cannot overflow due to leading zeros.
      q2''' = q2'' + q1 = [w20, w19] = [w20, w19] + [w10, w11] */
   bn.add w19, w19, w10
   bn.addc w20, w20, w11
   /* Conditionally add u (without leading 1) in case of MSb of x being set.
      This is the "real" q2 but shifted by 384 bits to the right. This cannot
      overflow due to leading zeros
      q2'''' = x[767]?q2'''+u[383:0]:q2'''
             = [w20, w19] + [w24, w23] = q2 >> 384 */
   bn.add w19, w19, w23
   bn.addc w20, w20, w24
   /* finally this gives q3 by shifting the remaining bit to the right
      q3 = q2 >> 385 = q2'''' >> 1 = [w11, w10] = [w20, w19] >> 1 */
   bn.rshi w11, w31, w20 >> 1
   bn.rshi w10, w20, w19 >> 1

   /* r2 = q3*m[511:0] = [w17, w16] = ([w11, w10] * [w13, w12])[511:0]
      A 384x384 bit multiplication kernel is used here, hence both q3 or p
      must not be wider than 384 bit. This is always the case for p. For q3 it
      is the case if a<p and b<p.
      The 256 highest bits of the multiplication result are not needed,
      so we do not compute them. */
   bn.mulqacc.z          w10.0, w12.0,   0
   bn.mulqacc            w10.0, w12.1,  64
   bn.mulqacc.so w16.L,  w10.1, w12.0,  64
   bn.mulqacc            w10.0, w12.2,   0
   bn.mulqacc            w10.1, w12.1,   0
   bn.mulqacc            w10.2, w12.0,   0
   bn.mulqacc            w10.0, w12.3,  64
   bn.mulqacc            w10.1, w12.2,  64
   bn.mulqacc            w10.2, w12.1,  64
   bn.mulqacc.so w16.U,  w10.3, w12.0,  64
   bn.mulqacc            w10.0, w13.0,   0
   bn.mulqacc            w10.1, w12.3,   0
   bn.mulqacc            w10.2, w12.2,   0
   bn.mulqacc            w10.3, w12.1,   0
   bn.mulqacc            w11.0, w12.0,   0
   bn.mulqacc            w10.0, w13.1,  64
   bn.mulqacc            w10.1, w13.0,  64
   bn.mulqacc            w10.2, w12.3,  64
   bn.mulqacc            w10.3, w12.2,  64
   bn.mulqacc            w11.0, w12.1,  64
   bn.mulqacc.so w17.L,  w11.1, w12.0,  64
   bn.mulqacc            w10.1, w13.1,   0
   bn.mulqacc            w10.2, w13.0,   0
   bn.mulqacc            w10.3, w12.3,   0
   bn.mulqacc            w11.0, w12.2,   0
   bn.mulqacc            w11.1, w12.1,   0
   bn.mulqacc            w10.2, w13.1,  64
   bn.mulqacc            w10.3, w13.0,  64
   bn.mulqacc            w11.0, w12.3,  64
   bn.mulqacc.so w17.U,  w11.1, w12.2,  64

   /* Compute r = x-r2 = x-q3*p
      since 0 <= r < 3*p, we only need to consider the lower limbs of x and r2
      r[511:0] = [w22, w21] - [w17, w16] */
   bn.sub w21, w21, w16
   bn.subb w22, w22, w17

   /* Barrett algorithm requires subtraction of the modulus at most two times if
      result is too large. However in the special case of P-384 we need to
      subtract only once */
   bn.sub w16, w21, w12
   bn.subb w17, w22, w13
   bn.sel w16, w21, w16, C
   bn.sel w17, w22, w17, C

   /* return result: c =[w17, w16] =  a * b % p. */
   ret


 /**
  * P-384 point addition in projective space
  *
  * returns R = (x_r, y_r, z_r) <= P+Q = (x_p, y_p, z_p) + (x_q, y_q, z_q)
  *         with R, P and Q being valid P-384 curve points
  *           in projective coordinates
  *
  * This routine adds two valid P-384 curve points in projective space.
  * Point addition is performed based on the complete formulas of Bosma and
  * Lenstra for Weierstrass curves as first published in [1] and
  * optimized in [2].
  * The implemented version follows Algorithm 4 of [2] which is an optimized
  * variant for Weierstrass curves with domain parameter 'a' set to a=-3.
  * Numbering of the steps below and naming of symbols follows the
  * terminology of Algorithm 4 of [2].
  * The routine is limited to P-384 curve points due to:
  *   - fixed a=-3 domain parameter
  *   - usage of a P-384 optimized Barrett multiplication kernel
  * This routine runs in constant time.
  *
  * [1] https://doi.org/10.1006/jnth.1995.1088
  * [2] https://doi.org/10.1007/978-3-662-49890-3_16
  *
  * @param[in]  x22: set to 10, pointer to in reg for Barrett routine
  * @param[in]  x23: set to 11, pointer to in reg for Barrett routine
  * @param[in]  x24: set to 16, pointer to in/out reg for Barrett routine
  * @param[in]  x25: set to 17, pointer to in/out reg for Barrett routine
  * @param[in]  x26: dptr_p_p, dmem pointer to point P in dmem (projective)
  * @param[in]  x27: dptr_q_p, dmem pointer to point Q in dmem (projective)
  * @param[in]  x28: dptr_b, dmem pointer to domain parameter b of P-384 in dmem
  * @param[in]  [w13, w12]: p, modulus of underlying field of P-384
  * @param[in]  [w15, w14]: u[383:0] lower 384 bit of Barrett constant u for
  *                                    modulus p
  * @param[in]  w31: all-zero.
  * @param[out]  [w26, w25]: x_r, x-coordinate of resulting point R
  * @param[out]  [w28, w27]: y_r, y-coordinate of resulting point R
  * @param[out]  [w30, w29]: z_r, z-coordinate of resulting point R
  *
  * Flags: When leaving this subroutine, flags of FG0 depend on an
  *        intermediate result and are not usable after return.
  *        FG1 is not modified in this subroutine.
  *
  * clobbered registers: w0 to w30
  * clobbered flag groups: FG0
  */
 .globl proj_add_p384
 proj_add_p384:
   /* mapping of parameters to symbols of [2] (Algorithm 4):
      X1 = x_p; Y1 = y_p; Z1 = z_p; X2 = x_q; Y2 = y_q; Z2 = z_q
      X3 = x_r; Y3 = y_r; Z3 = z_r */

   /* 1: [w1, w0] = t0 <= X1*X2 = dmem[x26+0]*dmem[x27+0] */
   bn.lid    x22, 0(x26)
   bn.lid    x23, 32(x26)
   bn.lid    x24, 0(x27)
   bn.lid    x25, 32(x27)
   jal       x1, barrett384_p384
   bn.mov    w0, w16
   bn.mov    w1, w17

   /* 2: [w3, w2] = t1 <= Y1*Y2 = dmem[x26+64]*dmem[x27+64] */
   bn.lid    x22, 64(x26)
   bn.lid    x23, 96(x26)
   bn.lid    x24, 64(x27)
   bn.lid    x25, 96(x27)
   jal       x1, barrett384_p384
   bn.mov    w2, w16
   bn.mov    w3, w17

   /* 3: [w5, w4] = t2 <= Z1*Z2 = dmem[x26+128]*dmem[x27+128] */
   bn.lid    x22, 128(x26)
   bn.lid    x23, 160(x26)
   bn.lid    x24, 128(x27)
   bn.lid    x25, 160(x27)
   jal       x1, barrett384_p384
   bn.mov    w4, w16
   bn.mov    w5, w17

   /* 4: [w7, w6] = t3 <= X1+Y1 = dmem[x26+0]+dmem[x26+64] */
   bn.lid    x22, 0(x26)
   bn.lid    x23, 32(x26)
   bn.lid    x24, 64(x26)
   bn.lid    x25, 96(x26)
   bn.add    w16, w10, w16
   bn.addc   w17, w11, w17
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w6, w16
   bn.mov    w7, w17

   /* 5: [w9, w8] = t4 <= X2+Y2 = dmem[x27+0]+dmem[x27+64] */
   bn.lid    x22, 0(x27)
   bn.lid    x23, 32(x27)
   bn.lid    x24, 64(x27)
   bn.lid    x25, 96(x27)
   bn.add    w16, w10, w16
   bn.addc   w17, w11, w17
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w8, w16
   bn.mov    w9, w17

   /* 6: [w7, w6] = t3 <= t3*t4 = [w7, w6]*[w9, w8] */
   bn.mov    w10, w6
   bn.mov    w11, w7
   bn.mov    w16, w8
   bn.mov    w17, w9
   jal       x1, barrett384_p384
   bn.mov    w6, w16
   bn.mov    w7, w17

   /* 7: [w9, w8] = t4 <= t0+t1 = [w1, w0]+[w3, w2] */
   bn.add    w16, w0, w2
   bn.addc   w17, w1, w3
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w8, w16
   bn.mov    w9, w17

   /* 8: [w7, w6] = t3 <= t3-t4 = [w7, w6]-[w9, w8] */
   bn.sub    w16, w6, w8
   bn.subb   w17, w7, w9
   bn.add    w10, w16, w12
   bn.addc   w11, w17, w13
   bn.sel    w16, w10, w16, C
   bn.sel    w17, w11, w17, C
   bn.mov    w6, w16
   bn.mov    w7, w17

   /* 9: [w9, w8] = t4 <= Y1+Z1 = dmem[x26+64]+dmem[x26+128] */
   bn.lid    x22, 64(x26)
   bn.lid    x23, 96(x26)
   bn.lid    x24, 128(x26)
   bn.lid    x25, 160(x26)
   bn.add    w16, w10, w16
   bn.addc   w17, w11, w17
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w8, w16
   bn.mov    w9, w17

   /* 10: [w26, w25] = X3 <= Y2+Z2 = dmem[x27+64]+dmem[x27+128] */
   bn.lid    x22, 64(x27)
   bn.lid    x23, 96(x27)
   bn.lid    x24, 128(x27)
   bn.lid    x25, 160(x27)
   bn.add    w16, w10, w16
   bn.addc   w17, w11, w17
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w25, w16
   bn.mov    w26, w17

   /* 11: [w9, w8] = t4 <= t4*X3 = [w9, w8]*[w26, w25] */
   bn.mov    w10, w8
   bn.mov    w11, w9
   bn.mov    w16, w25
   bn.mov    w17, w26
   jal       x1, barrett384_p384
   bn.mov    w8, w16
   bn.mov    w9, w17

   /* 12: [w26, w25] = X3 <= t1+t2 = [w3, w2]+[w5, w4] */
   bn.add    w16, w2, w4
   bn.addc   w17, w3, w5
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w25, w16
   bn.mov    w26, w17

   /* 13: [w9, w8] = t4 <= t4-X3 = [w9, w8]-[w26, w25] */
   bn.sub    w16, w8, w25
   bn.subb   w17, w9, w26
   bn.add    w10, w16, w12
   bn.addc   w11, w17, w13
   bn.sel    w16, w10, w16, C
   bn.sel    w17, w11, w17, C
   bn.mov    w8, w16
   bn.mov    w9, w17

   /* 14: [w26, w25] = X3 <= X1+Z1 = dmem[x26+0]+dmem[x26+128] */
   bn.lid    x22, 0(x26)
   bn.lid    x23, 32(x26)
   bn.lid    x24, 128(x26)
   bn.lid    x25, 160(x26)
   bn.add    w16, w10, w16
   bn.addc   w17, w11, w17
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w25, w16
   bn.mov    w26, w17

   /* 15: [w28, w27] = Y3 <= X2+Z2 = dmem[x27+0]+dmem[x27+128] */
   bn.lid    x22, 0(x27)
   bn.lid    x23, 32(x27)
   bn.lid    x24, 128(x27)
   bn.lid    x25, 160(x27)
   bn.add    w16, w10, w16
   bn.addc   w17, w11, w17
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w27, w16
   bn.mov    w28, w17

   /* 16: [w26, w25] = X3 <= X3*Y3 = [w26, w25]*[w28, w27] */
   bn.mov    w10, w25
   bn.mov    w11, w26
   bn.mov    w16, w27
   bn.mov    w17, w28
   jal       x1, barrett384_p384
   bn.mov    w25, w16
   bn.mov    w26, w17

   /* 17: [w28, w27] = Y3 <= t0+t2 = [w1, w0]+[w5, w4] */
   bn.add    w16, w0, w4
   bn.addc   w17, w1, w5
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w27, w16
   bn.mov    w28, w17

   /* 18: [w28, w27] = Y3 <= X3-Y3 = [w26, w25]-[w28, w27] */
   bn.sub    w16, w25, w27
   bn.subb   w17, w26, w28
   bn.add    w10, w16, w12
   bn.addc   w11, w17, w13
   bn.sel    w16, w10, w16, C
   bn.sel    w17, w11, w17, C
   bn.mov    w27, w16
   bn.mov    w28, w17

   /* 19: [w30, w29] = Z3 <= b*t2 = dmem[x28+0]*[w5, w4] */
   bn.lid    x22, 0(x28)
   bn.lid    x23, 32(x28)
   bn.mov    w16, w4
   bn.mov    w17, w5
   jal       x1, barrett384_p384
   bn.mov    w29, w16
   bn.mov    w30, w17

   /* 20: [w26, w25] = X3 <= Y3-Z3 = [w28, w27]-[w30, w29] */
   bn.sub    w16, w27, w29
   bn.subb   w17, w28, w30
   bn.add    w10, w16, w12
   bn.addc   w11, w17, w13
   bn.sel    w16, w10, w16, C
   bn.sel    w17, w11, w17, C
   bn.mov    w25, w16
   bn.mov    w26, w17

   /* 21: [w30, w29] = Z3 <= X3+X3 = [w26, w25]+[w26, w25] */
   bn.add    w16, w25, w25
   bn.addc   w17, w26, w26
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w29, w16
   bn.mov    w30, w17

   /* 22: [w26, w25] = X3 <= X3+Z3 = [w26, w25]+[w30, w29] */
   bn.add    w16, w25, w29
   bn.addc   w17, w26, w30
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w25, w16
   bn.mov    w26, w17

   /* 23: [w30, w29] = Z3 <= t1-X3 = [w3, w2]-[w26, w25] */
   bn.sub    w16, w2, w25
   bn.subb   w17, w3, w26
   bn.add    w10, w16, w12
   bn.addc   w11, w17, w13
   bn.sel    w16, w10, w16, C
   bn.sel    w17, w11, w17, C
   bn.mov    w29, w16
   bn.mov    w30, w17

   /* 24: [w26, w25] = X3 <= t1+X3 = [w3, w2]+[w26, w25] */
   bn.add    w16, w2, w25
   bn.addc   w17, w3, w26
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w25, w16
   bn.mov    w26, w17

   /* 25: [w28, w27] = Y3 <= b*Y3 = dmem[x28+0]*[w28, w27] */
   bn.lid    x22, 0(x28)
   bn.lid    x23, 32(x28)
   bn.mov    w16, w27
   bn.mov    w17, w28
   jal       x1, barrett384_p384
   bn.mov    w27, w16
   bn.mov    w28, w17

   /* 26: [w3, w2] = t1 <= t2+t2 = [w5, w4]+[w5, w4] */
   bn.add    w16, w4, w4
   bn.addc   w17, w5, w5
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w2, w16
   bn.mov    w3, w17

   /* 27: [w5, w4] = t2 <= t1+t2 = [w3, w2]+[w5, w4] */
   bn.add    w16, w2, w4
   bn.addc   w17, w3, w5
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w4, w16
   bn.mov    w5, w17

   /* 28: [w28, w27] = Y3 <= Y3-t2 = [w28, w27]-[w5, w4] */
   bn.sub    w16, w27, w4
   bn.subb   w17, w28, w5
   bn.add    w10, w16, w12
   bn.addc   w11, w17, w13
   bn.sel    w16, w10, w16, C
   bn.sel    w17, w11, w17, C
   bn.mov    w27, w16
   bn.mov    w28, w17

   /* 29: [w28, w27] = Y3 <= Y3-t0 = [w28, w27]-[w1, w0] */
   bn.sub    w16, w27, w0
   bn.subb   w17, w28, w1
   bn.add    w10, w16, w12
   bn.addc   w11, w17, w13
   bn.sel    w16, w10, w16, C
   bn.sel    w17, w11, w17, C
   bn.mov    w27, w16
   bn.mov    w28, w17

   /* 30: [w3, w2] = t1 <= Y3+Y3 = [w28, w27]+[w28, w27] */
   bn.add    w16, w27, w27
   bn.addc   w17, w28, w28
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w2, w16
   bn.mov    w3, w17

   /* 31: [w28, w27] = Y3 <= t1+Y3 = [w3, w2]+[w28, w27] */
   bn.add    w16, w2, w27
   bn.addc   w17, w3, w28
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w27, w16
   bn.mov    w28, w17

   /* 32: [w3, w2] = t1 <= t0+t0 = [w1, w0]+[w1, w0] */
   bn.add    w16, w0, w0
   bn.addc   w17, w1, w1
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w2, w16
   bn.mov    w3, w17

   /* 33: [w1, w0] = t0 <= t1+t0 = [w3, w2]+[w1, w0] */
   bn.add    w16, w2, w0
   bn.addc   w17, w3, w1
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w0, w16
   bn.mov    w1, w17

   /* 34: [w1, w0] = t0 <= t0-t2 = [w1, w0]-[w5, w4] */
   bn.sub    w16, w0, w4
   bn.subb   w17, w1, w5
   bn.add    w10, w16, w12
   bn.addc   w11, w17, w13
   bn.sel    w16, w10, w16, C
   bn.sel    w17, w11, w17, C
   bn.mov    w0, w16
   bn.mov    w1, w17

   /* 35: [w3, w2] = t1 <= t4*Y3 = [w9, w8]*[w28, w27] */
   bn.mov    w10, w8
   bn.mov    w11, w9
   bn.mov    w16, w27
   bn.mov    w17, w28
   jal       x1, barrett384_p384
   bn.mov    w2, w16
   bn.mov    w3, w17

   /* 36: [w5, w4] = t2 <= t0*Y3 = [w1, w0]*[w28, w27] */
   bn.mov    w10, w0
   bn.mov    w11, w1
   bn.mov    w16, w27
   bn.mov    w17, w28
   jal       x1, barrett384_p384
   bn.mov    w4, w16
   bn.mov    w5, w17

   /* 37: [w28, w27] = Y3 <= X3*Z3 = [w26, w25]*[w30, w29] */
   bn.mov    w10, w25
   bn.mov    w11, w26
   bn.mov    w16, w29
   bn.mov    w17, w30
   jal       x1, barrett384_p384
   bn.mov    w27, w16
   bn.mov    w28, w17

   /* 38: [w28, w27] = Y3 <= Y3+t2 = [w28, w27]+[w5, w4] */
   bn.add    w16, w27, w4
   bn.addc   w17, w28, w5
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w27, w16
   bn.mov    w28, w17

   /* 39: [w26, w25] = X3 <= t3*X3 = [w7, w6]*[w26, w25] */
   bn.mov    w10, w6
   bn.mov    w11, w7
   bn.mov    w16, w25
   bn.mov    w17, w26
   jal       x1, barrett384_p384
   bn.mov    w25, w16
   bn.mov    w26, w17

   /* 40: [w26, w25] = X3 <= X3-t1 = [w26, w25]-[w3, w2] */
   bn.sub    w16, w25, w2
   bn.subb   w17, w26, w3
   bn.add    w10, w16, w12
   bn.addc   w11, w17, w13
   bn.sel    w16, w10, w16, C
   bn.sel    w17, w11, w17, C
   bn.mov    w25, w16
   bn.mov    w26, w17

   /* 41: [w30, w29] = Z3 <= t4*Z3 = [w9, w8]*[w30, w29] */
   bn.mov    w10, w8
   bn.mov    w11, w9
   bn.mov    w16, w29
   bn.mov    w17, w30
   jal       x1, barrett384_p384
   bn.mov    w29, w16
   bn.mov    w30, w17

   /* 42: [w3, w2] = t1 <= t3*t0 = [w7, w6]*[w1, w0] */
   bn.mov    w10, w6
   bn.mov    w11, w7
   bn.mov    w16, w0
   bn.mov    w17, w1
   jal       x1, barrett384_p384
   bn.mov    w2, w16
   bn.mov    w3, w17

   /* 43: [w30, w29] = Z3 <= Z3+t1 = [w30, w29]+[w3, w2] */
   bn.add    w16, w29, w2
   bn.addc   w17, w30, w3
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w29, w16
   bn.mov    w30, w17

   ret


 /**
  * Convert projective coordinates of a P-384 curve point to affine coordinates
  *
  * returns P = (x_a, y_a) = (x/z mod p, y/z mod p)
  *              where P is a valid P-384 curve point,
  *                    x_a and y_a are the resulting affine coordinates of the
  *                      curve point,
  *                    x,y and z are a set of projective coordinates of the
  *                      point and
  *                    p is the modulus of the P-384 underlying finite field.
  *
  * This routine computes the affine coordinates for a set of projective
  * coordinates of a valid P-384 curve point. The routine performs the required
  * divisions by computing the multiplicative modular inverse of the
  * projective z-coordinate in the underlying finite field of the P-384 curve.
  * For inverse computation Fermat's little theorem is used, i.e.
  * we compute z^-1 = z^(p-2) mod p.
  * For exponentiation a 16 step addition chain is used.
  * Source of the addition chain is the addchain project:
  * https://github.com/mmcloughlin/addchain/
  *
  * Flags: Flags when leaving this subroutine depend on a potentially discarded
  *        value and therefore are not usable after return.
  *
  * @param[in]  [w26,w25]: x, x-coordinate of curve point (projective).
  * @param[in]  [w26,w25]: y, y-coordinate of curve point (projective).
  * @param[in]  [w30,w29]: z, z-coordinate of curve point (projective).
  * @param[in]  [w13, w12]: p, modulus of P-384.
  * @param[in]  [w15, w14]: u, pre-computed Barrett constant for p,
  *                            lower 384 bits, i.e. (2^(2*384) div p)[383:0].
  * @param[in] w31: all-zero.
  * @param[out] [w26, w25]: x_a, affine x-coordinate of resulting point.
  * @param[out] [w28, w27]: y_a, affine y-coordinate of resulting point.
  *
  * clobbered registers: w0 to w28
  * clobbered flag groups: FG0
  */
 proj_to_affine_p384:

   /* Exp: 0b10 = 2*0b1
      Val: r10 = z^2 mod p
           [w17,w16] <= [w30,w29]^2 mod [w13,w12] */
   bn.mov    w10, w29
   bn.mov    w11, w30
   bn.mov    w16, w29
   bn.mov    w17, w30
   jal       x1, barrett384_p384

   /* Exp: 0b11 = 0b1+0b10
      Val: r11 <= z*r10 mod p
           [w17,w16] <= [w30,w29]*[w17,w16] mod [w13,w12] */
   bn.mov    w10, w29
   bn.mov    w11, w30
   jal       x1, barrett384_p384

   /* Exp: 0b110 = 2*0b11
      Val: r110 = r11^2 mod p
           [w17,w16] <= [w17,w16]^2 mod [w13,w12] */
   bn.mov    w10, w16
   bn.mov    w11, w17
   jal       x1, barrett384_p384

   /* Exp: 0b111 = 0b1+0b110
      Val: r111 <= z*r110  mod p
           [w1,w0] = [w17,w16] <= [w30,w29]*[w17,w16] mod [w13,w12] */
   bn.mov    w10, w29
   bn.mov    w11, w30
   jal       x1, barrett384_p384
   bn.mov    w0, w16
   bn.mov    w1, w17

   /* Exp: 0b111000 = 0b111<<3
      Val: r111000 <= r111^(2^3)  mod p
           [w17,w16] <= [w17,w16]^(2^3) mod [w13,w12] */
   loopi     3, 4
     bn.mov    w10, w16
     bn.mov    w11, w17
     jal       x1, barrett384_p384
     nop

   /* Exp: 0b1111111 = 0b111+0b111000
      Val: r1111111 <= r111*r111000 mod p
           [w3,w2] = [w17,w16] <= [w1,w0]*[w17,w16] mod [w13,w12] */
   bn.mov    w10, w0
   bn.mov    w11, w1
   jal       x1, barrett384_p384
   bn.mov    w2, w16
   bn.mov    w3, w17

   /* Exp: 2^12-1 = (0b1111111<<6)+0b111111
      Val: r_12_1 <= r111111^(2^6)*r111111 mod p
           [w5,w4] = [w17,w16] <= [w17,w16]^(2^6)*[w17,w16] mod [w13,w12] */
   loopi     6, 4
     bn.mov    w10, w16
     bn.mov    w11, w17
     jal       x1, barrett384_p384
     nop
   bn.mov    w10, w2
   bn.mov    w11, w3
   jal       x1, barrett384_p384
   bn.mov    w4, w16
   bn.mov    w5, w17

   /* Exp: 2^24-1 = ((2^12-1)<<12)+(2^12-1)
      Val: r_24_1 <= r_12_1^(2^12)*r12_1 mod p
           [w17,w16] <= [w17,w16]^(2^12)*[w5,w4] mod [w13,w12] */
   loopi     12, 4
     bn.mov    w10, w16
     bn.mov    w11, w17
     jal       x1, barrett384_p384
     nop
   bn.mov    w10, w4
   bn.mov    w11, w5
   jal       x1, barrett384_p384

   /* Exp: 2^30-1 = ((2^24-1)<<6)+0b111111
      Val: r_30_1 <= r_24_1^(2^6)*r111111 mod p
           [w3, w2] = [w17,w16] <= [w17,w16]^(2^6)*[w3,w2] mod [w13,w12] */
   loopi     6, 4
     bn.mov    w10, w16
     bn.mov    w11, w17
     jal       x1, barrett384_p384
     nop
   bn.mov    w10, w2
   bn.mov    w11, w3
   jal       x1, barrett384_p384
   bn.mov    w2, w16
   bn.mov    w3, w17

   /* Exp: 2^31-1 <= (2^30-1)*2+0b1
      Val: r_31_1 <= r30_1^2*z mod p
           [w7,w6] = [w17,w16] <= [w17,w16]^2*[w30,w29] mod [w13,w12] */
   bn.mov    w10, w16
   bn.mov    w11, w17
   jal       x1, barrett384_p384
   bn.mov    w10, w29
   bn.mov    w11, w30
   jal       x1, barrett384_p384
   bn.mov    w6, w16
   bn.mov    w7, w17

   /* Exp: 2^32-1 <= (2^30-1)*2+0b1
      Val: r_32_1 <= r31_1^2*z mod p
           [w9,w8] = [w17,w16] <= [w17,w16]^2*[w30,w29] mod [w13,w12] */
   bn.mov    w10, w16
   bn.mov    w11, w17
   jal       x1, barrett384_p384
   bn.mov    w10, w29
   bn.mov    w11, w30
   jal       x1, barrett384_p384
   bn.mov    w9, w16
   bn.mov    w8, w17

   /* Exp: 2^63-1 <= ((2^32-1)<<31)+(2^31-1)
      Val: r_63_1 <= r_32_1^(2^31)*r_31_1 mod p
           [w7,w6] = [w17,w16] <= [w17,w16]^(2^31)*[w7,w6] mod [w13,w12] */
   loopi     31, 4
     bn.mov    w10, w16
     bn.mov    w11, w17
     jal       x1, barrett384_p384
     nop
   bn.mov    w10, w6
   bn.mov    w11, w7
   jal       x1, barrett384_p384
   bn.mov    w6, w16
   bn.mov    w7,w17

   /* Exp: 2^126-1 = ((2^63-1)<<63) + (2^63-1)
      Val: r_126_1 <= r_63_1^(2^63)*r_63_1 mod p
           [w7,w6] = [w17,w16] <= [w17,w16]^(2^63)*[w7,w6] mod [w13,w12] */
   loopi     63, 4
     bn.mov    w10, w16
     bn.mov    w11, w17
     jal       x1, barrett384_p384
     nop
   bn.mov    w10, w6
   bn.mov    w11, w7
   jal       x1, barrett384_p384
   bn.mov    w6, w16
   bn.mov    w7, w17

   /* Exp: 2^252-1 = ((2^126-1)<<126)+(2^126-1)
      Val: r_252_1 <= r_126_1^(2^63)*r_126_1 mod p
           [w17,w16] <= [w17,w16]^(2^126)*[w7,w6] mod [w13,w12] */
   loopi     126, 4
     bn.mov    w10, w16
     bn.mov    w11, w17
     jal       x1, barrett384_p384
     nop
   bn.mov    w10, w6
   bn.mov    w11, w7
   jal       x1, barrett384_p384

   /* Exp: 2^255-1 = ((2^252-1)<<3)+0b111
      Val: r_255_1 <= r_252_1^(2^3)*r111 mod p
           [w17,w16] <= [w17,w16]^(2^3)*[w1,w0] mod [w13,w12] */
   loopi     3, 4
     bn.mov    w10, w16
     bn.mov    w11, w17
     jal       x1, barrett384_p384
     nop
   bn.mov    w10, w0
   bn.mov    w11, w1
   jal       x1, barrett384_p384

   /* Exp: p-2 = ((((((2^255-1)<<33)+(2^32-1))<<94)+(2^30-1))<<2)+0b1
      Val: x_inv <=((r_255_1^(2^33)*r_32_1)^(2^94)*r_30_1)^(2^2)*z mod p
           [w17,w16] <= (([w17,w16]^(2^33)*[w9,w8])^(2^94)*[w3,w2])^(2^2)
                        *[w30,w29] mod [w13,w12] */
   loopi     33, 4
     bn.mov    w10, w16
     bn.mov    w11, w17
     jal       x1, barrett384_p384
     nop
   bn.mov    w10, w9
   bn.mov    w11, w8
   jal       x1, barrett384_p384
   loopi     94, 4
     bn.mov    w10, w16
     bn.mov    w11, w17
     jal       x1, barrett384_p384
     nop
   bn.mov    w10, w2
   bn.mov    w11, w3
   jal       x1, barrett384_p384
   loopi     2, 4
     bn.mov    w10, w16
     bn.mov    w11, w17
     jal       x1, barrett384_p384
     nop
   bn.mov    w10, w29
   bn.mov    w11, w30
   jal       x1, barrett384_p384

   /* store inverse [w1,w0] <= [w17,w16] = z_inv*/
   bn.mov w0, w16
   bn.mov w1, w17

   /* convert x-coordinate to affine space
      [w26,w25] <= [w17,w16] = x_a <= x/z = x*z_inv = [w26,w25]*[w1,w0] mod p */
   bn.mov    w10, w25
   bn.mov    w11, w26
   jal       x1, barrett384_p384
   bn.mov    w25, w16
   bn.mov    w26, w17

   /* convert y-coordinate to affine space
      [w28,w27] <= [w17,w16] = y_a <= y/z = y*z_inv = [w28,w27]*[w1,w0] mod p */
   bn.mov    w10, w27
   bn.mov    w11, w28
   bn.mov    w16, w0
   bn.mov    w17, w1
   jal       x1, barrett384_p384
   bn.mov    w27, w16
   bn.mov    w28, w17

   ret


 /**
  * Fetch curve point from dmem, randomize z-coordinate and store point in dmem
  *
  * returns P = (x, y, z) = (x_a*z, y_a*z, z)
  *         with P being a valid P-384 curve point in projective coordinates
  *              x_a and y_a being the affine coordinates as fetched from dmem
  *              z being a randomized z-coordinate
  *
  * This routines fetches the affine x- and y-coordinates of a curve point from
  * dmem and computes a valid set of projective coordinates. The z-coordinate is
  * randomized and x and y are scaled appropriately. The resulting projective
  * coordinates are stored at dmem[dptr_p_p] using 6 consecutive 256-bit cells,
  * i.e. each coordinate is stored 512 bit aligned, little endian.
  * This routine runs in constant time.
  *
  * @param[in]  x20: dptr_x, pointer to dmem location containing affine
  *                          x-coordinate of input point
  * @param[in]  x21: dptr_y, pointer to dmem location containing affine
  *                          y-coordinate of input point
  * @param[in]  [w15, w14]: u[383:0] lower 384 bit of Barrett constant u for
  *                                    modulus p
  * @param[in]  [w13, w12]: p, modulus of P-384 underlying finite field
  * @param[in]  w31: all-zero
  * @param[in]  x18: dptr_p_p, pointer to dmem location to store resulting point
  *                            in projective space
  *
  * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
  *        the upper limb of projective y-coordinate.
  *
  * clobbered registers: x10, x11 to x13
   *                     w2, w3, w8 to w11, w16 to w24, w29, w30
  * clobbered flag groups: FG0
  */
 store_proj_randomize:

   /* get a 384-bit random number
     [w3, w2] = random(384) */
   bn.wsrr   w2, 1
   bn.wsrr   w3, 1
   bn.rshi   w3, w31, w3 >> 128

   /* reduce random number
      [w2, w3] = z <= [w2, w3] mod p */
   bn.sub   w10, w2, w12
   bn.subb  w11, w3, w13
   bn.sel   w2, w2, w10, C
   bn.sel   w3, w3, w11, C

   bn.mov w10, w2
   bn.mov w11, w3

   /* store z-coordinate
      dmem[x20+128] = [w10, w11] */
   li        x10, 10
   li        x11, 11
   bn.sid    x10, 128(x18)
   bn.sid    x11, 160(x18)

   /* fetch x-coordinate from dmem
      [w16, w17] = x <= [dmem[dptr_x], dmem[dptr_x+32]] */
   li x12, 16
   li x13, 17
   bn.lid    x12,  0(x20)
   bn.lid    x13, 32(x20)

   /* scale and store x-coordinate
      [dmem[dptr_p_p], dmem[dptr_p_p+32]] = [w17, w16] =
        x_p <= [w11, w10] * [w17, w16] = z*x  mod p */

   jal       x1, barrett384_p384
   bn.sid    x12,  0(x18)
   bn.sid    x13, 32(x18)

   /* fetch y-coordinate from dmem
      [w11, w10] = x <= [dmem[dptr_y], dmem[dptr_y+32]] */
   bn.lid    x12,  0(x21)
   bn.lid    x13, 32(x21)

   /* scale and store y-coordinate
      [dmem[dptr_p_p+64], dmem[dptr_p_p+96]] = [w17, w16] =
        y_p <= [w11, w10] * [w17, w16] = z*y  mod p */
   bn.mov w10, w2
   bn.mov w11, w3
   jal       x1, barrett384_p384
   bn.sid    x12, 64(x18)
   bn.sid    x13, 96(x18)

   ret


 /**
  * P-256 scalar point multiplication in affine space
  *
  * returns R = d*P = d*(x_p, y_p)
  *         where R, P are valid P-256 curve points in affine coordinates,
  *               d is a 256-bit scalar.
  *
  * This routine performs scalar multiplication based on the group laws
  * of Weierstrass curves.
  * A constant time double-and-add algorithm (sometimes referred to as
  * double-and-add-always) is used.
  * Due to the P-384 optimized implementations of the internally called routines
  * for point addition and doubling, this routine is limited to P-384 curves.
  * The routine makes use of blinding by additive splitting the
  * exponent/scalar d into two shares. The double-and-add loop operates on both
  * shares in parallel applying Shamir's trick.
  *
  * @param[in]  x9: dptr_rnd, pointer to location in dmem containing random
  *                           number to be used for additive splitting of scalar
  * @param[in]  x19: dptr_d, pointer to scalar d (0 < d < n) in dmem
  * @param[in]  x20: dptr_x, pointer to affine x-coordinate in dmem
  * @param[in]  x21: dptr_y, pointer to affine y-coordinate in dmem
  * @param[in]  x28: dptr_b, pointer to domain parameter b of P-384 in dmem
  * @param[in]  x30: dptr_sp, pointer to 704 bytes of scratchpad memory in dmem
  * @param[in]  [w15, w14]: u[383:0] lower 384 bit of Barrett constant u
  *                                  corresponding to modulus p
  * @param[in]  [w13, w12]: p, modulus of P-384 underlying finite field
  * @param[in]  [w11, w10]: n, domain parameter of P-384 curve
  *                            (order of basepoint G)
  * @param[in]  w31: all-zero
  * @param[out] [w26, w25]: x_a, affine x-coordinate of resulting point R.
  * @param[out] [w28, w26]: y_a, affine y-coordinate of resulting point R.
  *
  * Scratchpad memory layout:
  * The routine expects at least 704 bytes of scratchpad memory at dmem
  * location dptr_sp. Internally the scratchpad is used as follows:
  * dptr_sp     .. dptr_sp+192: point P, projective
  * dptr_sp+192 .. dptr_sp+255: s0, 1st share of scalar
  * dptr_sp+256 .. dptr_sp+447: point 2P, projective
  * dptr_sp+448 .. dptr_sp+511: s1, 2nd share of scalar
  * dptr_sp+512 .. dptr_sp+703: point Q, projective
  *
  * Projective coordinates of a point are kept in dmem in little endian format
  * with the individual coordinates 512 bit aligned. The coordinates are stored
  * in x,y,z order (i.e. x at lowest, z at highest address). Thus, a 384 bit
  * curve point occupies 6 consecutive 256-bit dmem cells.
  *
  * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
  *        the computed affine y-coordinate.
  *
  * clobbered registers: x2, x10, x11 to x13, x18, x26, x27, w0 to w30
  * clobbered flag groups: FG0
  */
 .globl scalar_mult_int_p384
 scalar_mult_int_p384:

   /* set regfile pointers to in/out regs of Barrett routine. Set here to avoid
      resetting in very call to point addition routine */
   li        x22, 10
   li        x23, 11
   li        x24, 16
   li        x25, 17

   /* fetch externally supplied random number from dmem
      [w1, w0] = dmem[dptr_rnd] = [dmem[x9], dmem[x9+32]] = rnd */
   li        x2, 0
   bn.lid    x2++, 0(x9)
   bn.lid    x2++, 32(x9)

   /* 1st share (reduced rnd)
      s0 = [w1, w0] <= rnd mod n = [w1, w0] mod [w11, w10] */
   bn.sub    w9, w0, w10
   bn.subb   w8, w1, w11
   bn.sel    w0, w0, w9, C
   bn.sel    w1, w1, w8, C

   /* load scalar d from dmem
      [w3, w2] = d <= dmem[dptr_d] = [dmem[x19], dmem[x19+32]] */
   bn.lid    x2++, 0(x19)
   bn.lid    x2, 32(x19)

   /* 2nd share (d-s0)
      s1 = [w3, w2] <= d - s0 mod n = [w2, w3] - [w1, w0] mod [w11, w10] */
   bn.sub    w2, w2, w0
   bn.subb   w3, w3, w1
   bn.add    w8, w2, w10
   bn.addc   w9, w3, w11
   bn.sel    w2, w8, w2, C
   bn.sel    w3, w9, w3, C

   /* left align both shares for probing of MSB in loop body */
   bn.rshi   w1, w1, w0 >> 128
   bn.rshi   w0, w0, w31 >> 128
   bn.rshi   w3, w3, w2 >> 128
   bn.rshi   w2, w2, w31 >> 128

    /* store shares in scratchpad */
   li        x2, 0
   bn.sid    x2++, 192(x30)
   bn.sid    x2++, 224(x30)
   bn.sid    x2++, 448(x30)
   bn.sid    x2++, 480(x30)

   /* get randomized projective coodinates of curve point
      P = (x_p, y_p, z_p) = dmem[dptr_sp] = (x*z mod p, y*z mod p, z) */
   add       x18, x30, 0
   jal       x1, store_proj_randomize

   /* double point P
      2P = ([w30,w29], [w28,w27], [w26, w25]) <= 2*P */
   add       x27, x30, x0
   add       x26, x30, x0
   jal       x1, proj_add_p384

   /* store point 2P in scratchpad @w30+256
      dmem[dptr_sc+256] = [w30:w25] = 2P */
   li        x2, 25
   bn.sid    x2++, 256(x30)
   bn.sid    x2++, 288(x30)
   bn.sid    x2++, 320(x30)
   bn.sid    x2++, 352(x30)
   bn.sid    x2++, 384(x30)
   bn.sid    x2++, 416(x30)

   /* init point Q = (0,1,0) for double-and-add in scratchpad */
   /* dmem[x26] = dmem[dptr_sc+512] = Q = (0,1,0) */
   addi      x26, x30, 512
   li        x2, 30
   bn.addi   w30, w31, 1
   bn.sid    x2++, 64(x26)
   bn.sid    x2, 0(x26)
   bn.sid    x2, 32(x26)
   bn.sid    x2, 96(x26)
   bn.sid    x2, 128(x26)
   bn.sid    x2, 160(x26)

   /* double-and-add loop with decreasing index */
   loopi     384, 85

     /* double point Q
        Q = ([w30,w29], [w28,w27], [w26, w25]) <= Q + dmem[x27] */
     add       x27, x26, x0
     jal       x1, proj_add_p384

     /* store Q in dmem
      dmem[x26] = dmem[dptr_sc+512] <= [w30:w25] */
     li        x2, 25
     bn.sid    x2++, 0(x26)
     bn.sid    x2++, 32(x26)
     bn.sid    x2++, 64(x26)
     bn.sid    x2++, 96(x26)
     bn.sid    x2++, 128(x26)
     bn.sid    x2++, 160(x26)

     /* Probe if MSb of either of the two scalars (rnd or d-rnd) but not both
        is 1.
        If only one MSb is set, select P for addition.
        If both MSbs are set, select 2P for addition.
        (If neither MSB is set, 2P will be selected but result discarded.) */
     li        x2, 0
     bn.lid    x2++, 224(x30)
     bn.lid    x2, 480(x30)
     bn.xor    w8, w0, w1
     /* Create conditional offeset into scratchpad.
        if (s0[512] xor s1[512]) x27 <= x30 else x27 <= x30+256 */
     csrrs     x3, 0x7c0, x0
     xori      x3, x3, -1
     andi      x3, x3, 2
     slli      x27, x3, 7
     add       x27, x27, x30

     /* Reload randomized projective coodinates for curve point P.
        P = (x_p, y_p, z_p) = dmem[dptr_sp] <= (x*z mod p, y*z mod p, z) */
     jal       x1, store_proj_randomize

     /* Add points Q+P or Q+2P depending on offset in x27.
        Q_a = ([w30,w29], [w28,w27], [w26, w25]) <= Q + dmem[x27] */
     jal       x1, proj_add_p384

     /* load shares from scratchpad
        [w1, w0] = s0; [w3, w2] = s1 */
     li        x2, 0
     bn.lid    x2++, 192(x30)
     bn.lid    x2++, 224(x30)
     bn.lid    x2++, 448(x30)
     bn.lid    x2++, 480(x30)

     /* M = s0[511] | s1[511] */
     bn.or     w8, w1, w3

     /* load q from scratchpad
         Q = ([w9,w8], [w7,w6], [w5,w4]) <= dmem[x26] */
     li        x2, 4
     bn.lid    x2++, 0(x26)
     bn.lid    x2++, 32(x26)
     bn.lid    x2++, 64(x26)
     bn.lid    x2++, 96(x26)
     bn.lid    x2++, 128(x26)
     bn.lid    x2++, 160(x26)

     /* select either Q or Q_a
        if M: Q = ([w30,w29], [w28,w27], [w26, w25]) <= Q else: Q <= Q_a */
     bn.sel    w25, w25, w4, M
     bn.sel    w26, w26, w5, M
     bn.sel    w27, w27, w6, M
     bn.sel    w28, w28, w7, M
     bn.sel    w29, w29, w8, M
     bn.sel    w30, w30, w9, M

     /* store Q in dmem
      dmem[x26] = dmem[dptr_sc+512] <= [w30:w25] */
     li        x2, 25
     bn.sid    x2++, 0(x26)
     bn.sid    x2++, 32(x26)
     bn.sid    x2++, 64(x26)
     bn.sid    x2++, 96(x26)
     bn.sid    x2++, 128(x26)
     bn.sid    x2++, 160(x26)

     /* left shift both shares
        s0 <= s0 << 1 ; s1 <= s1 << 1 */
     bn.add    w0, w0, w0
     bn.addc   w1, w1, w1
     bn.add    w2, w2, w2
     bn.addc   w3, w3, w3
     /* store both shares in scratchpad */
     li        x2, 0
     bn.sid    x2++, 192(x30)
     bn.sid    x2++, 224(x30)
     bn.sid    x2++, 448(x30)
     bn.sid    x2++, 480(x30)


     /* Get a fresh random number and scale the coordinates of 2P.
        (scaling each proj. coordinate by same factor results in same point) */

     /* get a 384-bit random number */
     bn.wsrr   w2, 1
     bn.wsrr   w3, 1
     bn.rshi   w3, w31, w3 >> 128

     /* reduce random number
       [w2, w3] = z <= [w2, w3] mod p */
     bn.sub    w10, w2, w12
     bn.subb   w11, w3, w13
     bn.sel    w2, w2, w10, C
     bn.sel    w3, w3, w11, C

     /* scale all coordinates in scratchpad */
     li        x2, 16
     li        x3, 17
     /* x-coordinate */
     bn.mov    w10, w2
     bn.mov    w11, w3
     bn.lid    x2, 256(x30)
     bn.lid    x3, 288(x30)
     jal       x1, barrett384_p384
     bn.sid    x2, 256(x30)
     bn.sid    x3, 288(x30)
     /* y-coordinate */
     bn.mov    w10, w2
     bn.mov    w11, w3
     bn.lid    x2, 320(x30)
     bn.lid    x3, 352(x30)
     jal       x1, barrett384_p384
     bn.sid    x2, 320(x30)
     bn.sid    x3, 352(x30)
     /* z-coordinate */
     bn.mov    w10, w2
     bn.mov    w11, w3
     bn.lid    x2, 384(x30)
     bn.lid    x3, 416(x30)
     jal       x1, barrett384_p384
     bn.sid    x2, 384(x30)
     bn.sid    x3, 416(x30)

   /* convert coordinates to affine space */
   jal       x1, proj_to_affine_p384

   ret