/* sw/otbn/crypto/p384_base.s - 3p/lowrisc/opentitan - Git at Google */

 /* Copyright lowRISC contributors. */
 /* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
 /* SPDX-License-Identifier: Apache-2.0 */
 /*
  *   This library contains:
  *   - P-384 specific routines for point addition in projective space
  *   - P-384 domain parameters
  */

  .section .text

 /**
  * Unrolled 768=384x384 bit multiplication.
  *
  * Returns c = a x b.
  *
  * Both operands are treated as six 64-bit limbs, a_0..a_5 and b_0..b_5
  * (a_0..a_3 = w10.0..w10.3, a_4..a_5 = w11.0..w11.1, likewise for b in
  * w16/w17). All 36 limb products a_i * b_j are accumulated column by column
  * (grouped by i+j); after every two columns the low 128 bits of the
  * accumulator are written to one half of a result register.
  *
  * This routine runs in constant time.
  *
  * Flags: Flags have no meaning beyond the scope of this subroutine.
  *
  * @param[in] [w11, w10]: a, first operand, max. length 384 bit (only the
  *                        lower 128 bits of w11 are read).
  * @param[in] [w17, w16]: b, second operand, max. length 384 bit (only the
  *                        lower 128 bits of w17 are read).
  * @param[in] w31: all-zero (not referenced by the instructions below).
  * @param[out] [w20:w18]: c, result, max. length 768 bit.
  *
  * Clobbered registers: w18 to w20 (and the multiplier accumulator)
  * Clobbered flag groups: FG0
  */
 mul384:
   /* c[127:0]: columns i+j = 0 (shift 0) and i+j = 1 (shift 64).
      The .z variant clears the accumulator before adding the first product;
      the .so variant stores the low 128 accumulator bits to the selected
      half of the destination and shifts the accumulator right by 128. */
   bn.mulqacc.z          w10.0, w16.0,   0
   bn.mulqacc            w10.0, w16.1,  64
   bn.mulqacc.so w18.L,  w10.1, w16.0,  64
   /* c[255:128]: columns i+j = 2 (shift 0) and i+j = 3 (shift 64). */
   bn.mulqacc            w10.0, w16.2,   0
   bn.mulqacc            w10.1, w16.1,   0
   bn.mulqacc            w10.2, w16.0,   0
   bn.mulqacc            w10.0, w16.3,  64
   bn.mulqacc            w10.1, w16.2,  64
   bn.mulqacc            w10.2, w16.1,  64
   bn.mulqacc.so w18.U,  w10.3, w16.0,  64
   /* c[383:256]: columns i+j = 4 (shift 0) and i+j = 5 (shift 64). */
   bn.mulqacc            w10.0, w17.0,   0
   bn.mulqacc            w10.1, w16.3,   0
   bn.mulqacc            w10.2, w16.2,   0
   bn.mulqacc            w10.3, w16.1,   0
   bn.mulqacc            w11.0, w16.0,   0
   bn.mulqacc            w10.0, w17.1,  64
   bn.mulqacc            w10.1, w17.0,  64
   bn.mulqacc            w10.2, w16.3,  64
   bn.mulqacc            w10.3, w16.2,  64
   bn.mulqacc            w11.0, w16.1,  64
   bn.mulqacc.so w19.L,  w11.1, w16.0,  64
   /* c[511:384]: columns i+j = 6 (shift 0) and i+j = 7 (shift 64). */
   bn.mulqacc            w10.1, w17.1,   0
   bn.mulqacc            w10.2, w17.0,   0
   bn.mulqacc            w10.3, w16.3,   0
   bn.mulqacc            w11.0, w16.2,   0
   bn.mulqacc            w11.1, w16.1,   0
   bn.mulqacc            w10.2, w17.1,  64
   bn.mulqacc            w10.3, w17.0,  64
   bn.mulqacc            w11.0, w16.3,  64
   bn.mulqacc.so w19.U,  w11.1, w16.2,  64
   /* c[639:512]: columns i+j = 8 (shift 0) and i+j = 9 (shift 64). */
   bn.mulqacc            w10.3, w17.1,   0
   bn.mulqacc            w11.0, w17.0,   0
   bn.mulqacc            w11.1, w16.3,   0
   bn.mulqacc            w11.0, w17.1,  64
   bn.mulqacc.so w20.L,  w11.1, w17.0,  64
   /* c[767:640]: last column i+j = 10 (a_5 * b_5) plus what is left in the
      accumulator. */
   bn.mulqacc.so w20.U,  w11.1, w17.1,   0

   ret


 /**
  * 384-bit modular multiplication based on Solinas reduction algorithm.
  *
  * Returns c = a x b % p.
  *
  * This subroutine is specialized to the coordinate field of P-384 and cannot
  * be used for other moduli.
  *
  * Solinas reduction is based on the observation that if the modulus has the
  * form (2^384 - K), then for all x and y:
  *   (x + 2^384 * y) mod (2^384 - K) = (x + K * y) mod (2^384 - K).
  *
  * For P-384, the constant K is: (2^128 + 2^96 - 2^32 + 1). A "Solinas
  * reduction step" consists of splitting a large number (such as the result of
  * a multiplication) into two parts: the lowest 384 bits (x in the formula
  * above) and any bits above that point (y in the formula above), then
  * multiplying y by K and adding it to x. Because of K's special form, the
  * multiplication by K for the P-384 modulus is especially quick: it is done
  * with shifted additions/subtractions only.
  *
  * The routine performs two such reduction steps followed by one conditional
  * subtraction of the modulus. The raw product is obtained through a nested
  * call to mul384.
  *
  * This routine runs in constant time.
  *
  * Flags: Flags have no meaning beyond the scope of this subroutine.
  *
  * @param[in] [w11, w10]: a, first operand, max. length 384 bit, a < m.
  * @param[in] [w17, w16]: b, second operand, max. length 384 bit, b < m.
  * @param[in] [w13, w12]: m, modulus, 2^383 <= m < 2^384. Since K is
  *                        hard-coded below, m has to be the P-384 prime p.
  * @param[in] w31: all-zero.
  * @param[out] [w17, w16]: c, result, max. length 384 bit.
  *
  * Clobbered registers: w16 to w24
  * Clobbered flag groups: FG0
  */
 .globl p384_mulmod_p
 p384_mulmod_p:
   /* Compute the raw 768-bit product:
        ab = [w20:w18] <= a * b */
   jal     x1, mul384

   /* Solinas reduction step. Based on the observation that:
      (x + 2^384 * y) mod (2^384 - K) = (x + K * y) mod (2^384 - K).

     For P-384, the constant K = 2^384 - modulus is: (2^128 + 2^96 - 2^32 + 1)
    */

   /* Extract the high 128 bits from the middle term and the low 128 bits from
      the high term:
        w21 <= ab[639:384] */
   bn.rshi w21, w20, w19 >> 128

   /* Multiply by K:
      [w24:w23] <= w21 + (w21 << 128) + (w21 << 96) - (w21 << 32) = ab[639:384] * K
      Each pair of instructions handles one term of K: the first instruction
      updates the low 256 bits, the second propagates the carry/borrow into
      the high 256 bits. */
   bn.add  w23, w21, w21 << 128
   bn.addc w24, w31, w21 >> 128
   bn.add  w23, w23, w21 << 96
   bn.addc w24, w24, w21 >> 160
   bn.sub  w23, w23, w21 << 32
   bn.subb w24, w24, w21 >> 224

   /* Construct a 256-bit mask:
        w22 <= 2^256 - 1 */
   bn.not  w22, w31

   /* Isolate the lower 384 bits (the mask is shifted down to 2^128 - 1):
        w19 <= ab[383:256] */
   bn.and  w19, w19, w22 >> 128

   /* Add product to the lower 384 bits:
        [w19:w18] = ab[383:0] + (ab[639:384] * K) */
   bn.add  w18, w18, w23
   bn.addc w19, w19, w24

   /* Isolate the highest 128 bits of the product:
        w21 <= ab[767:640] */
   bn.rshi w21, w31, w20 >> 128

   /* Multiply by K:
      [w24:w23] <= w21 + (w21 << 128) + (w21 << 96) - (w21 << 32) = ab[767:640] * K */
   bn.add  w23, w21, w21 << 128
   bn.addc w24, w31, w21 >> 128
   bn.add  w23, w23, w21 << 96
   bn.addc w24, w24, w21 >> 160
   bn.sub  w23, w23, w21 << 32
   bn.subb w24, w24, w21 >> 224

   /* Add product to the result to complete the reduction step. The product
      belongs to bit position 256 and up, so it is added starting at w19:
        [w20:w18] = ab[383:0] + (ab[767:384] * K) */
   bn.add  w19, w19, w23
   bn.addc w20, w31, w24

   /* At this point, the intermediate result r is max. 576 bits, because:
        ab[383:0]: 384 bits
        ab[767:384]: 384 bits
        ab[767:384] * K : 575 bits
        r = ab[383:0] + ab[767:384] * K : 576 bits

     Start another Solinas step to reduce the bound further. */

   /* Extract the high 192 bits:
        w21 <= r[575:384] */
   bn.rshi w21, w20, w19 >> 128

   /* Multiply by K:
      [w24:w23] <= w21 + (w21 << 128) + (w21 << 96) - (w21 << 32) = r[575:384] * K */
   bn.add  w23, w21, w21 << 128
   bn.addc w24, w31, w21 >> 128
   bn.add  w23, w23, w21 << 96
   bn.addc w24, w24, w21 >> 160
   bn.sub  w23, w23, w21 << 32
   bn.subb w24, w24, w21 >> 224

   /* Isolate the lower 384 bits (w22 still holds the all-ones mask):
        w19 <= r[383:256] */
   bn.and  w19, w19, w22 >> 128

   /* Add product to the lower 384 bits to complete the reduction step:
        [w19:w18] = r[383:0] + (r[575:384] * K) */
   bn.add  w18, w18, w23
   bn.addc w19, w19, w24

   /* At this point, the result is at most 385 bits, and a conditional
      subtraction is sufficient to fully reduce. */
   bn.sub  w16, w18, w12
   bn.subb w17, w19, w13

   /* If the subtraction underflowed (C is set), select the pre-subtraction
      result; otherwise, select the result of the subtraction. */
   bn.sel w16, w18, w16, C
   bn.sel w17, w19, w17, C

   /* return result: c =[w17, w16] =  a * b % m. */
   ret

 /**
  * 384-bit modular multiplication based on Solinas reduction algorithm.
  *
  * Returns c = a x b % m.
  *
  * This subroutine is intended for use with the group order (n) of P-384, but
  * will work for any modulus m such that 2^384 - 2^191 < m < 2^384.
  *
  * Solinas reduction is based on the observation that if the modulus has the
  * form (2^384 - K), then for all x and y:
  *   (x + 2^384 * y) mod (2^384 - K) = (x + K * y) mod (2^384 - K).
  *
  * A "Solinas reduction step" consists of splitting a large number (such as the
  * result of a multiplication) into two parts: the lowest 384 bits (x in the
  * formula above) and any bits above that point (y in the formula above), then
  * multiplying y by K and adding it to x.
  *
  * Unlike p384_mulmod_p, K has no special form here, so every multiplication
  * by K is a real multi-limb multiplication on the three lowest 64-bit limbs
  * of w14. The routine performs two reduction steps followed by one
  * conditional subtraction of the modulus. The raw product is obtained through
  * a nested call to mul384.
  *
  * This routine runs in constant time.
  *
  * Flags: Flags have no meaning beyond the scope of this subroutine.
  *
  * @param[in] [w11, w10]: a, first operand, max. length 384 bit, a < m.
  * @param[in] [w17, w16]: b, second operand, max. length 384 bit, b < m.
  * @param[in] [w13, w12]: m, modulus, 2^383 <= m < 2^384.
  * @param[in] w14: k, Solinas constant (2^384 - modulus), max. length 191 bit.
  * @param[in] w31: all-zero.
  * @param[out] [w17, w16]: c, result, max. length 384 bit.
  *
  * Clobbered registers: w16 to w24
  * Clobbered flag groups: FG0
  */
 .globl p384_mulmod_n
 p384_mulmod_n:
   /* Compute the raw 768-bit product:
        ab = [w20:w18] <= a * b */
   jal     x1, mul384

   /* Solinas reduction step. Based on the observation that:
      (x + 2^384 * y) mod (2^384 - K) = (x + K * y) mod (2^384 - K). */

   /* Extract the high 128 bits from the middle term and the low 128 bits from
      the high term:
        w21 <= ab[639:384] */
   bn.rshi w21, w20, w19 >> 128

   /* Multiply by K (256bx192b multiplication, 4 x 3 limb products):
        [w24:w23] <= w21 * w14 = ab[639:384] * K */
   bn.mulqacc.z          w21.0, w14.0,   0
   bn.mulqacc            w21.0, w14.1,  64
   bn.mulqacc.so w23.L,  w21.1, w14.0,  64
   bn.mulqacc            w21.0, w14.2,   0
   bn.mulqacc            w21.1, w14.1,   0
   bn.mulqacc            w21.2, w14.0,   0
   bn.mulqacc            w21.1, w14.2,  64
   bn.mulqacc            w21.2, w14.1,  64
   bn.mulqacc.so w23.U,  w21.3, w14.0,  64
   bn.mulqacc            w21.2, w14.2,   0
   bn.mulqacc            w21.3, w14.1,   0
   bn.mulqacc.wo w24,    w21.3, w14.2,  64

   /* Construct a 256-bit mask:
        w22 <= 2^256 - 1 */
   bn.not  w22, w31

   /* Isolate the lower 384 bits (the mask is shifted down to 2^128 - 1):
        w19 <= ab[383:256] */
   bn.and  w19, w19, w22 >> 128

   /* Add product to the lower 384 bits:
        [w19:w18] = ab[383:0] + (ab[639:384] * K) */
   bn.add  w18, w18, w23
   bn.addc w19, w19, w24

   /* Isolate the highest 128 bits of the product:
        w21 <= ab[767:640] */
   bn.rshi w21, w31, w20 >> 128

   /* Multiply by K (128bx192b multiplication, 2 x 3 limb products):
      [w24:w23] <=  ab[767:640] * K */
   bn.mulqacc.z          w21.0, w14.0,   0
   bn.mulqacc            w21.0, w14.1,  64
   bn.mulqacc.so w23.L,  w21.1, w14.0,  64
   bn.mulqacc            w21.0, w14.2,   0
   bn.mulqacc            w21.1, w14.1,   0
   bn.mulqacc.so w23.U,  w21.1, w14.2,  64
   /* Write remaining accumulator to w24; multiply by known zeroes to avoid
      changing the accumulator. */
   bn.mulqacc.wo w24,    w31.0, w31.0,   0

   /* Add product to the result to complete the reduction step. The product
      belongs to bit position 256 and up, so it is added starting at w19:
        [w20:w18] = ab[383:0] + (ab[767:384] * K) */
   bn.add  w19, w19, w23
   bn.addc w20, w31, w24

   /* At this point, the intermediate result r is max. 576 bits, because:
        ab[383:0]: 384 bits
        ab[767:384]: 384 bits
        ab[767:384] * K : 575 bits
        r = ab[383:0] + ab[767:384] * K : 576 bits

     Start another Solinas step to reduce the bound further. */

   /* Extract the high 192 bits:
        w21 <= r[575:384] */
   bn.rshi w21, w20, w19 >> 128

   /* Multiply by K (192bx192b multiplication, 3 x 3 limb products; the top
      limb w21.3 is not read, relying on the 576-bit bound above):
        [w24:w23] <= w21 * w14 = r[575:384] * K */
   bn.mulqacc.z          w21.0, w14.0,   0
   bn.mulqacc            w21.0, w14.1,  64
   bn.mulqacc.so w23.L,  w21.1, w14.0,  64
   bn.mulqacc            w21.0, w14.2,   0
   bn.mulqacc            w21.1, w14.1,   0
   bn.mulqacc            w21.2, w14.0,   0
   bn.mulqacc            w21.1, w14.2,  64
   bn.mulqacc.so w23.U,  w21.2, w14.1,  64
   bn.mulqacc.wo w24,    w21.2, w14.2,   0

   /* Isolate the lower 384 bits (w22 still holds the all-ones mask):
        w19 <= r[383:256] */
   bn.and  w19, w19, w22 >> 128

   /* Add product to the lower 384 bits to complete the reduction step:
        [w19:w18] = r[383:0] + (r[575:384] * K) */
   bn.add  w18, w18, w23
   bn.addc w19, w19, w24

   /* At this point, the result is at most 385 bits, and a conditional
      subtraction is sufficient to fully reduce. */
   bn.sub  w16, w18, w12
   bn.subb w17, w19, w13

   /* If the subtraction underflowed (C is set), select the pre-subtraction
      result; otherwise, select the result of the subtraction. */
   bn.sel w16, w18, w16, C
   bn.sel w17, w19, w17, C

   /* return result: c =[w17, w16] =  a * b % m. */
   ret

 /**
  * P-384 point addition in projective space
  *
  * returns R = (x_r, y_r, z_r) <= P+Q = (x_p, y_p, z_p) + (x_q, y_q, z_q)
  *         with R, P and Q being valid P-384 curve points
  *           in projective coordinates
  *
  * This routine adds two valid P-384 curve points in projective space.
  * Point addition is performed based on the complete formulas of Bosma and
  * Lenstra for Weierstrass curves as first published in [1] and
  * optimized in [2].
  * The implemented version follows Algorithm 4 of [2] which is an optimized
  * variant for Weierstrass curves with domain parameter 'a' set to a=-3.
  * Numbering of the steps below and naming of symbols follows the
  * terminology of Algorithm 4 of [2].
  * The routine is limited to P-384 curve points due to:
  *   - fixed a=-3 domain parameter
  *   - usage of the P-384 specific Solinas-reduction multiplication kernel
  *     (p384_mulmod_p)
  * This routine runs in constant time.
  *
  * Memory layout expected by the loads below: every coordinate and the
  * domain parameter b occupy 64 bytes (two 256-bit words, low word first).
  * A point is stored as x at offset 0, y at offset 64 and z at offset 128.
  *
  * [1] https://doi.org/10.1006/jnth.1995.1088
  * [2] https://doi.org/10.1007/978-3-662-49890-3_16
  *
  * @param[in]  x22: set to 10, pointer to in reg for modular multiplication
  * @param[in]  x23: set to 11, pointer to in reg for modular multiplication
  * @param[in]  x24: set to 16, pointer to in/out reg for modular multiplication
  * @param[in]  x25: set to 17, pointer to in/out reg for modular multiplication
  * @param[in]  x26: dptr_p_p, dmem pointer to point P in dmem (projective)
  * @param[in]  x27: dptr_q_p, dmem pointer to point Q in dmem (projective)
  * @param[in]  x28: dptr_b, dmem pointer to domain parameter b of P-384 in dmem
  * @param[in]  [w13, w12]: p, modulus of underlying field of P-384
  * @param[in]  w31: all-zero.
  * @param[out]  [w26, w25]: x_r, x-coordinate of resulting point R
  * @param[out]  [w28, w27]: y_r, y-coordinate of resulting point R
  * @param[out]  [w30, w29]: z_r, z-coordinate of resulting point R
  *
  * Flags: Flags have no meaning beyond the scope of this subroutine.
  *
  * clobbered registers: w0 to w30 (conservative: in the code below w12 and
  *                      w13 are only read and w14 and w15 are not referenced)
  * clobbered flag groups: FG0
  */
 .globl proj_add_p384
 proj_add_p384:
   /* mapping of parameters to symbols of [2] (Algorithm 4):
      X1 = x_p; Y1 = y_p; Z1 = z_p; X2 = x_q; Y2 = y_q; Z2 = z_q
      X3 = x_r; Y3 = y_r; Z3 = z_r */

   /* Building blocks used by the steps below (operands are expected to be
      reduced, i.e. smaller than p):
        - multiplication: the operands are placed in [w11, w10] and
          [w17, w16]; p384_mulmod_p returns the product mod p in [w17, w16].
        - addition: the plain sum is computed, then p is subtracted from it.
          If that subtraction borrowed (C set) the plain sum is kept,
          otherwise the difference is kept.
        - subtraction: the plain difference is computed, then p is added to
          it. If that addition carried out (C set, which happens exactly
          when the difference had wrapped around) the sum is kept, otherwise
          the plain difference is kept.
      [w11, w10] doubles as scratch space in the addition and subtraction
      patterns. With x22..x25 set to 10, 11, 16 and 17, the bn.lid
      instructions load directly into [w11, w10] and [w17, w16]. */

   /* 1: [w1, w0] = t0 <= X1*X2 = dmem[x26+0]*dmem[x27+0] */
   bn.lid    x22, 0(x26)
   bn.lid    x23, 32(x26)
   bn.lid    x24, 0(x27)
   bn.lid    x25, 32(x27)
   jal       x1, p384_mulmod_p
   bn.mov    w0, w16
   bn.mov    w1, w17

   /* 2: [w3, w2] = t1 <= Y1*Y2 = dmem[x26+64]*dmem[x27+64] */
   bn.lid    x22, 64(x26)
   bn.lid    x23, 96(x26)
   bn.lid    x24, 64(x27)
   bn.lid    x25, 96(x27)
   jal       x1, p384_mulmod_p
   bn.mov    w2, w16
   bn.mov    w3, w17

   /* 3: [w5, w4] = t2 <= Z1*Z2 = dmem[x26+128]*dmem[x27+128] */
   bn.lid    x22, 128(x26)
   bn.lid    x23, 160(x26)
   bn.lid    x24, 128(x27)
   bn.lid    x25, 160(x27)
   jal       x1, p384_mulmod_p
   bn.mov    w4, w16
   bn.mov    w5, w17

   /* 4: [w7, w6] = t3 <= X1+Y1 = dmem[x26+0]+dmem[x26+64] */
   bn.lid    x22, 0(x26)
   bn.lid    x23, 32(x26)
   bn.lid    x24, 64(x26)
   bn.lid    x25, 96(x26)
   bn.add    w16, w10, w16
   bn.addc   w17, w11, w17
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w6, w16
   bn.mov    w7, w17

   /* 5: [w9, w8] = t4 <= X2+Y2 = dmem[x27+0]+dmem[x27+64] */
   bn.lid    x22, 0(x27)
   bn.lid    x23, 32(x27)
   bn.lid    x24, 64(x27)
   bn.lid    x25, 96(x27)
   bn.add    w16, w10, w16
   bn.addc   w17, w11, w17
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w8, w16
   bn.mov    w9, w17

   /* 6: [w7, w6] = t3 <= t3*t4 = [w7, w6]*[w9, w8] */
   bn.mov    w10, w6
   bn.mov    w11, w7
   bn.mov    w16, w8
   bn.mov    w17, w9
   jal       x1, p384_mulmod_p
   bn.mov    w6, w16
   bn.mov    w7, w17

   /* 7: [w9, w8] = t4 <= t0+t1 = [w1, w0]+[w3, w2] */
   bn.add    w16, w0, w2
   bn.addc   w17, w1, w3
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w8, w16
   bn.mov    w9, w17

   /* 8: [w7, w6] = t3 <= t3-t4 = [w7, w6]-[w9, w8] */
   bn.sub    w16, w6, w8
   bn.subb   w17, w7, w9
   bn.add    w10, w16, w12
   bn.addc   w11, w17, w13
   bn.sel    w16, w10, w16, C
   bn.sel    w17, w11, w17, C
   bn.mov    w6, w16
   bn.mov    w7, w17

   /* 9: [w9, w8] = t4 <= Y1+Z1 = dmem[x26+64]+dmem[x26+128] */
   bn.lid    x22, 64(x26)
   bn.lid    x23, 96(x26)
   bn.lid    x24, 128(x26)
   bn.lid    x25, 160(x26)
   bn.add    w16, w10, w16
   bn.addc   w17, w11, w17
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w8, w16
   bn.mov    w9, w17

   /* 10: [w26, w25] = X3 <= Y2+Z2 = dmem[x27+64]+dmem[x27+128] */
   bn.lid    x22, 64(x27)
   bn.lid    x23, 96(x27)
   bn.lid    x24, 128(x27)
   bn.lid    x25, 160(x27)
   bn.add    w16, w10, w16
   bn.addc   w17, w11, w17
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w25, w16
   bn.mov    w26, w17

   /* 11: [w9, w8] = t4 <= t4*X3 = [w9, w8]*[w26, w25] */
   bn.mov    w10, w8
   bn.mov    w11, w9
   bn.mov    w16, w25
   bn.mov    w17, w26
   jal       x1, p384_mulmod_p
   bn.mov    w8, w16
   bn.mov    w9, w17

   /* 12: [w26, w25] = X3 <= t1+t2 = [w3, w2]+[w5, w4] */
   bn.add    w16, w2, w4
   bn.addc   w17, w3, w5
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w25, w16
   bn.mov    w26, w17

   /* 13: [w9, w8] = t4 <= t4-X3 = [w9, w8]-[w26, w25] */
   bn.sub    w16, w8, w25
   bn.subb   w17, w9, w26
   bn.add    w10, w16, w12
   bn.addc   w11, w17, w13
   bn.sel    w16, w10, w16, C
   bn.sel    w17, w11, w17, C
   bn.mov    w8, w16
   bn.mov    w9, w17

   /* 14: [w26, w25] = X3 <= X1+Z1 = dmem[x26+0]+dmem[x26+128] */
   bn.lid    x22, 0(x26)
   bn.lid    x23, 32(x26)
   bn.lid    x24, 128(x26)
   bn.lid    x25, 160(x26)
   bn.add    w16, w10, w16
   bn.addc   w17, w11, w17
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w25, w16
   bn.mov    w26, w17

   /* 15: [w28, w27] = Y3 <= X2+Z2 = dmem[x27+0]+dmem[x27+128] */
   bn.lid    x22, 0(x27)
   bn.lid    x23, 32(x27)
   bn.lid    x24, 128(x27)
   bn.lid    x25, 160(x27)
   bn.add    w16, w10, w16
   bn.addc   w17, w11, w17
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w27, w16
   bn.mov    w28, w17

   /* 16: [w26, w25] = X3 <= X3*Y3 = [w26, w25]*[w28, w27] */
   bn.mov    w10, w25
   bn.mov    w11, w26
   bn.mov    w16, w27
   bn.mov    w17, w28
   jal       x1, p384_mulmod_p
   bn.mov    w25, w16
   bn.mov    w26, w17

   /* 17: [w28, w27] = Y3 <= t0+t2 = [w1, w0]+[w5, w4] */
   bn.add    w16, w0, w4
   bn.addc   w17, w1, w5
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w27, w16
   bn.mov    w28, w17

   /* 18: [w28, w27] = Y3 <= X3-Y3 = [w26, w25]-[w28, w27] */
   bn.sub    w16, w25, w27
   bn.subb   w17, w26, w28
   bn.add    w10, w16, w12
   bn.addc   w11, w17, w13
   bn.sel    w16, w10, w16, C
   bn.sel    w17, w11, w17, C
   bn.mov    w27, w16
   bn.mov    w28, w17

   /* 19: [w30, w29] = Z3 <= b*t2 = dmem[x28+0]*[w5, w4] */
   bn.lid    x22, 0(x28)
   bn.lid    x23, 32(x28)
   bn.mov    w16, w4
   bn.mov    w17, w5
   jal       x1, p384_mulmod_p
   bn.mov    w29, w16
   bn.mov    w30, w17

   /* 20: [w26, w25] = X3 <= Y3-Z3 = [w28, w27]-[w30, w29] */
   bn.sub    w16, w27, w29
   bn.subb   w17, w28, w30
   bn.add    w10, w16, w12
   bn.addc   w11, w17, w13
   bn.sel    w16, w10, w16, C
   bn.sel    w17, w11, w17, C
   bn.mov    w25, w16
   bn.mov    w26, w17

   /* 21: [w30, w29] = Z3 <= X3+X3 = [w26, w25]+[w26, w25] */
   bn.add    w16, w25, w25
   bn.addc   w17, w26, w26
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w29, w16
   bn.mov    w30, w17

   /* 22: [w26, w25] = X3 <= X3+Z3 = [w26, w25]+[w30, w29] */
   bn.add    w16, w25, w29
   bn.addc   w17, w26, w30
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w25, w16
   bn.mov    w26, w17

   /* 23: [w30, w29] = Z3 <= t1-X3 = [w3, w2]-[w26, w25] */
   bn.sub    w16, w2, w25
   bn.subb   w17, w3, w26
   bn.add    w10, w16, w12
   bn.addc   w11, w17, w13
   bn.sel    w16, w10, w16, C
   bn.sel    w17, w11, w17, C
   bn.mov    w29, w16
   bn.mov    w30, w17

   /* 24: [w26, w25] = X3 <= t1+X3 = [w3, w2]+[w26, w25] */
   bn.add    w16, w2, w25
   bn.addc   w17, w3, w26
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w25, w16
   bn.mov    w26, w17

   /* 25: [w28, w27] = Y3 <= b*Y3 = dmem[x28+0]*[w28, w27] */
   bn.lid    x22, 0(x28)
   bn.lid    x23, 32(x28)
   bn.mov    w16, w27
   bn.mov    w17, w28
   jal       x1, p384_mulmod_p
   bn.mov    w27, w16
   bn.mov    w28, w17

   /* 26: [w3, w2] = t1 <= t2+t2 = [w5, w4]+[w5, w4] */
   bn.add    w16, w4, w4
   bn.addc   w17, w5, w5
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w2, w16
   bn.mov    w3, w17

   /* 27: [w5, w4] = t2 <= t1+t2 = [w3, w2]+[w5, w4] */
   bn.add    w16, w2, w4
   bn.addc   w17, w3, w5
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w4, w16
   bn.mov    w5, w17

   /* 28: [w28, w27] = Y3 <= Y3-t2 = [w28, w27]-[w5, w4] */
   bn.sub    w16, w27, w4
   bn.subb   w17, w28, w5
   bn.add    w10, w16, w12
   bn.addc   w11, w17, w13
   bn.sel    w16, w10, w16, C
   bn.sel    w17, w11, w17, C
   bn.mov    w27, w16
   bn.mov    w28, w17

   /* 29: [w28, w27] = Y3 <= Y3-t0 = [w28, w27]-[w1, w0] */
   bn.sub    w16, w27, w0
   bn.subb   w17, w28, w1
   bn.add    w10, w16, w12
   bn.addc   w11, w17, w13
   bn.sel    w16, w10, w16, C
   bn.sel    w17, w11, w17, C
   bn.mov    w27, w16
   bn.mov    w28, w17

   /* 30: [w3, w2] = t1 <= Y3+Y3 = [w28, w27]+[w28, w27] */
   bn.add    w16, w27, w27
   bn.addc   w17, w28, w28
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w2, w16
   bn.mov    w3, w17

   /* 31: [w28, w27] = Y3 <= t1+Y3 = [w3, w2]+[w28, w27] */
   bn.add    w16, w2, w27
   bn.addc   w17, w3, w28
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w27, w16
   bn.mov    w28, w17

   /* 32: [w3, w2] = t1 <= t0+t0 = [w1, w0]+[w1, w0] */
   bn.add    w16, w0, w0
   bn.addc   w17, w1, w1
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w2, w16
   bn.mov    w3, w17

   /* 33: [w1, w0] = t0 <= t1+t0 = [w3, w2]+[w1, w0] */
   bn.add    w16, w2, w0
   bn.addc   w17, w3, w1
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w0, w16
   bn.mov    w1, w17

   /* 34: [w1, w0] = t0 <= t0-t2 = [w1, w0]-[w5, w4] */
   bn.sub    w16, w0, w4
   bn.subb   w17, w1, w5
   bn.add    w10, w16, w12
   bn.addc   w11, w17, w13
   bn.sel    w16, w10, w16, C
   bn.sel    w17, w11, w17, C
   bn.mov    w0, w16
   bn.mov    w1, w17

   /* 35: [w3, w2] = t1 <= t4*Y3 = [w9, w8]*[w28, w27] */
   bn.mov    w10, w8
   bn.mov    w11, w9
   bn.mov    w16, w27
   bn.mov    w17, w28
   jal       x1, p384_mulmod_p
   bn.mov    w2, w16
   bn.mov    w3, w17

   /* 36: [w5, w4] = t2 <= t0*Y3 = [w1, w0]*[w28, w27] */
   bn.mov    w10, w0
   bn.mov    w11, w1
   bn.mov    w16, w27
   bn.mov    w17, w28
   jal       x1, p384_mulmod_p
   bn.mov    w4, w16
   bn.mov    w5, w17

   /* 37: [w28, w27] = Y3 <= X3*Z3 = [w26, w25]*[w30, w29] */
   bn.mov    w10, w25
   bn.mov    w11, w26
   bn.mov    w16, w29
   bn.mov    w17, w30
   jal       x1, p384_mulmod_p
   bn.mov    w27, w16
   bn.mov    w28, w17

   /* 38: [w28, w27] = Y3 <= Y3+t2 = [w28, w27]+[w5, w4] */
   bn.add    w16, w27, w4
   bn.addc   w17, w28, w5
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w27, w16
   bn.mov    w28, w17

   /* 39: [w26, w25] = X3 <= t3*X3 = [w7, w6]*[w26, w25] */
   bn.mov    w10, w6
   bn.mov    w11, w7
   bn.mov    w16, w25
   bn.mov    w17, w26
   jal       x1, p384_mulmod_p
   bn.mov    w25, w16
   bn.mov    w26, w17

   /* 40: [w26, w25] = X3 <= X3-t1 = [w26, w25]-[w3, w2] */
   bn.sub    w16, w25, w2
   bn.subb   w17, w26, w3
   bn.add    w10, w16, w12
   bn.addc   w11, w17, w13
   bn.sel    w16, w10, w16, C
   bn.sel    w17, w11, w17, C
   bn.mov    w25, w16
   bn.mov    w26, w17

   /* 41: [w30, w29] = Z3 <= t4*Z3 = [w9, w8]*[w30, w29] */
   bn.mov    w10, w8
   bn.mov    w11, w9
   bn.mov    w16, w29
   bn.mov    w17, w30
   jal       x1, p384_mulmod_p
   bn.mov    w29, w16
   bn.mov    w30, w17

   /* 42: [w3, w2] = t1 <= t3*t0 = [w7, w6]*[w1, w0] */
   bn.mov    w10, w6
   bn.mov    w11, w7
   bn.mov    w16, w0
   bn.mov    w17, w1
   jal       x1, p384_mulmod_p
   bn.mov    w2, w16
   bn.mov    w3, w17

   /* 43: [w30, w29] = Z3 <= Z3+t1 = [w30, w29]+[w3, w2] */
   bn.add    w16, w29, w2
   bn.addc   w17, w30, w3
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
   bn.sel    w16, w16, w10, C
   bn.sel    w17, w17, w11, C
   bn.mov    w29, w16
   bn.mov    w30, w17

   ret


 .section .data

 /* P-384 domain parameter b.
    Stored as twelve 32-bit words, least significant word first, followed by
    16 zero bytes of padding, so the value occupies 64 bytes (two 256-bit
    words). This matches the two-word load (offsets 0 and 32) that
    proj_add_p384 performs on its pointer to b. */
 .globl p384_b
 p384_b:
   .word 0xd3ec2aef
   .word 0x2a85c8ed
   .word 0x8a2ed19d
   .word 0xc656398d
   .word 0x5013875a
   .word 0x0314088f
   .word 0xfe814112
   .word 0x181d9c6e
   .word 0xe3f82d19
   .word 0x988e056b
   .word 0xe23ee7e4
   .word 0xb3312fa7
   .zero 16

 /* P-384 domain parameter p (modulus of the coordinate field),
    p = 2^384 - 2^128 - 2^96 + 2^32 - 1.
    Stored as twelve 32-bit words, least significant word first, followed by
    16 zero bytes of padding, so the value occupies 64 bytes (two 256-bit
    words). */
 .globl p384_p
 p384_p:
   .word 0xffffffff
   .word 0x00000000
   .word 0x00000000
   .word 0xffffffff
   .word 0xfffffffe
   .word 0xffffffff
   .word 0xffffffff
   .word 0xffffffff
   .word 0xffffffff
   .word 0xffffffff
   .word 0xffffffff
   .word 0xffffffff
   .zero 16

 /* P-384 domain parameter n (order of base point).
    Stored as twelve 32-bit words, least significant word first, followed by
    16 zero bytes of padding, so the value occupies 64 bytes (two 256-bit
    words). The upper six words are all ones, so 2^384 - n fits into the
    lower 192 bits. */
 .globl p384_n
 p384_n:
   .word 0xccc52973
   .word 0xecec196a
   .word 0x48b0a77a
   .word 0x581a0db2
   .word 0xf4372ddf
   .word 0xc7634d81
   .word 0xffffffff
   .word 0xffffffff
   .word 0xffffffff
   .word 0xffffffff
   .word 0xffffffff
   .word 0xffffffff
   .zero 16

 /* P-384 basepoint G affine x-coordinate.
    Stored as twelve 32-bit words, least significant word first, followed by
    16 zero bytes of padding, so the value occupies 64 bytes (two 256-bit
    words). */
 .globl p384_gx
 p384_gx:
   .word 0x72760ab7
   .word 0x3a545e38
   .word 0xbf55296c
   .word 0x5502f25d
   .word 0x82542a38
   .word 0x59f741e0
   .word 0x8ba79b98
   .word 0x6e1d3b62
   .word 0xf320ad74
   .word 0x8eb1c71e
   .word 0xbe8b0537
   .word 0xaa87ca22
   .zero 16

 /* P-384 basepoint G affine y-coordinate.
    Stored as twelve 32-bit words, least significant word first, followed by
    16 zero bytes of padding, so the value occupies 64 bytes (two 256-bit
    words). */
 .globl p384_gy
 p384_gy:
   .word 0x90ea0e5f
   .word 0x7a431d7c
   .word 0x1d7e819d
   .word 0x0a60b1ce
   .word 0xb5f0b8c0
   .word 0xe9da3113
   .word 0x289a147c
   .word 0xf8f41dbd
   .word 0x9292dc29
   .word 0x5d9e98bf
   .word 0x96262c6f
   .word 0x3617de4a
   .zero 16