/* Copyright lowRISC Contributors.
* Copyright 2016 The Chromium OS Authors. All rights reserved.
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE.dcrypto file.
*
* Derived from code in
* https://chromium.googlesource.com/chromiumos/platform/ec/+/refs/heads/cr50_stab/chip/g/dcrypto/dcrypto_p256.c
*/
.globl p256_init
.globl p256_isoncurve
.globl p256_scalarmult
.globl p256_scalar_base_mult
.globl p256_sign
.globl p256_verify
.globl proj_add
.text
/**
* Initialize field modulus and corresponding Barrett constant for P-256
*
* This routine runs in constant time.
*
* Flags: This routine does not modify any flag group
*
* @param[out] w29: p, modulus of P-256 underlying finite field
* @param[out] w28: u, lower 256 bit of Barrett constant for curve P-256
* @param[out] MOD: p, modulus of P-256 underlying finite field
*
* clobbered registers: x2, w28, w29
* clobbered flag groups: none
*/
setup_barrett_p:
/* load field modulus p =
w29 = p = 2^256 - 2^224 + 2^192 + 2^96 - 1
0xFFFFFFFF00000001000000000000000000000000FFFFFFFFFFFFFFFFFFFFFFFF */
sw x0, 524(x0)
sw x0, 528(x0)
sw x0, 532(x0)
addi x2, x0, 1
sw x2, 536(x0)
addi x2, x0, -1
sw x2, 512(x0)
sw x2, 516(x0)
sw x2, 520(x0)
sw x2, 540(x0)
addi x2, x0, 29
bn.lid x2, 512(x0)
/* store modulus to MOD WSR */
bn.wsrw 0, w29
/* load Barrett constant u (lower 256 bit only)
w28 = u = floor(2^512/p)[255:0] =
0X00000000FFFFFFFFFFFFFFFEFFFFFFFEFFFFFFFEFFFFFFFF0000000000000003*/
sw x0, 548(x0)
sw x0, 572(x0)
addi x2, x0, -1
sw x2, 552(x0)
sw x2, 568(x0)
addi x2, x0, 3
sw x2, 544(x0)
addi x2, x0, -2
sw x2, 556(x0)
sw x2, 560(x0)
sw x2, 564(x0)
addi x2, x0, 28
bn.lid x2, 544(x0)
ret
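/**
 * Illustrative cross-check (Python, not part of the OTBN code): how the two
 * constants stored above relate. A minimal sketch assuming arbitrary
 * precision Python integers; the assert reproduces the hex value quoted in
 * the comment above.
 *
 *   p = 2**256 - 2**224 + 2**192 + 2**96 - 1
 *   u = 2**512 // p   # 257 bits; bit 256 is always 1 since 2^255 < p < 2^256
 *   assert u - 2**256 == \
 *       0x00000000FFFFFFFFFFFFFFFEFFFFFFFEFFFFFFFEFFFFFFFF0000000000000003
 */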
/**
* Initialize curve P-256 (secp256r1)
*
* Setup the context for the P-256 curve:
* - load domain parameter b
* - load P-256 underlying field modulus p and corresponding Barrett constant
* - prepare all-zero-reg and always-one-reg
*
* Note that the domain parameter 'a' of P-256 can be written as a=-3.
* Therefore, parameter 'a' is not provided in a register but has to be
* incorporated in the algorithms of the individual subroutines.
* This routine runs in constant time.
*
* Flags: This routine does not modify any flag group
*
* @param[out] w27: b, domain parameter b of P-256
* @param[out] w28: u, lower 256 bit of Barrett constant for curve P-256
* @param[out] w29: p, modulus of P-256 underlying finite field
* @param[out] w30: constant 1
* @param[out] w31: all-zero
*
* clobbered registers: x2, x3, w27, w28, w29, w30, w31
* clobbered flag groups: none
*/
p256_init:
addi x3, x0, 31
bn.lid x3, 0(x0)
/* init all-zero register */
bn.xor w31, w31, w31
/* init reg with constant 1 */
bn.addi w30, w31, 1
/* setup modulus and corresponding Barrett constant */
jal x1, setup_barrett_p
/* load domain parameter b via dmem
w27 = b
b = 0X5AC635D8AA3A93E7B3EBBD55769886BC651D06B0CC53B0F63BCE3C3E27D2604B */
lui x2, 163110
addi x2, x2, 75
sw x2, 576(x0)
lui x2, 244964
addi x2, x2, -962
sw x2, 580(x0)
lui x2, 836923
addi x2, x2, 246
sw x2, 584(x0)
lui x2, 414160
addi x2, x2, 1712
sw x2, 588(x0)
lui x2, 485768
addi x2, x2, 1724
sw x2, 592(x0)
lui x2, 736956
addi x2, x2, -683
sw x2, 596(x0)
lui x2, 697257
addi x2, x2, 999
sw x2, 600(x0)
lui x2, 371811
addi x2, x2, 1496
sw x2, 604(x0)
addi x2, x0, 27
bn.lid x2, 576(x0)
ret
/**
* 256-bit modular multiplication based on Barrett reduction algorithm.
*
* Returns c = a * b mod p
*
* Expects two 256 bit operands, 256 bit modulus and pre-computed parameter u
* for Barrett reduction (usually greek mu in literature). u is expected
* without the leading 1 at bit 256. u has to be pre-computed as
* u = floor(2^512/p).
* This guarantees that u > 2^256. However, in order for u to be at most
* 2^257-1, it has to be ensured that p >= 2^255 + 1.
*
* This implementation mostly follows the description in the
* "Handbook of Applied Cryptography" in Algorithm 14.42.
* Differences:
* - This implementation incorporates a multiplication before the reduction.
* Therefore it expects two operands (a, b) instead of a wider integer x.
* - The computation of q2 ignores the MSbs of q1 and u to allow using
* a 256x256 bit multiplication. This is compensated later by
* individual (conditional) additions.
* - The truncations in step 2 of HAC 14.42 in the form of (... mod b^(k+1) )
* are not implemented here and the full register width is used. This
* allows omitting the computation of r1 (since r1 = x) and step 3 of HAC 14.42.
* - There is only a single conditional subtraction of the modulus at the end
*
* Note that this implementation is targeted at, and tested with, the modulus
* and Barrett constant of the P-256 underlying finite field only. For a
* generic modulus a 2nd conditional subtraction of the modulus has to be
* added or the modulus has to be in a range such that it can be mathematically
* proven that a single subtraction is sufficient.
*
* Flags: Flags when leaving this subroutine depend on a potentially discarded
* value and therefore are not usable after return.
*
* @param[in] w24: a, first 256 bit operand (a < p)
* @param[in] w25: b, second 256 bit operand (b < p)
* @param[in] w29: p, modulus of P-256 underlying finite field
* @param[in] w28: u, lower 256 bit of Barrett constant for curve P-256
* @param[in] w31: all-zero
* @param[in] MOD: p, modulus of P-256 underlying finite field
* @param[out] w19: c, result
*
* clobbered registers: w19, w20, w21, w22, w23, w24, w25
* clobbered flag groups: FG0
*/
mod_mul_256x256:
/* Compute the integer product of the operands x = a * b
x = [w20, w19] = a * b = w24 * w25
=> max. length x: 512 bit */
bn.mulqacc.z w24.0, w25.0, 0
bn.mulqacc w24.1, w25.0, 64
bn.mulqacc.so w19.L, w24.0, w25.1, 64
bn.mulqacc w24.2, w25.0, 0
bn.mulqacc w24.1, w25.1, 0
bn.mulqacc w24.0, w25.2, 0
bn.mulqacc w24.3, w25.0, 64
bn.mulqacc w24.2, w25.1, 64
bn.mulqacc w24.1, w25.2, 64
bn.mulqacc.so w19.U, w24.0, w25.3, 64
bn.mulqacc w24.3, w25.1, 0
bn.mulqacc w24.2, w25.2, 0
bn.mulqacc w24.1, w25.3, 0
bn.mulqacc w24.3, w25.2, 64
bn.mulqacc.so w20.L, w24.2, w25.3, 64
bn.mulqacc.so w20.U, w24.3, w25.3, 0
bn.add w20, w20, w31
/* Store a correction factor to compensate for the later neglected MSb of x.
x is 512 bit wide and therefore the 255 bit right shifted version q1
(below) contains 257 bit. Bit 256 of q1 is neglected to allow using a
256x256 multiplier. If the MSb of x is set, we temporarily store u
(otherwise zero) here, to be used in a later constant time correction of
the multiplication with u. Note that this requires the M flag to be
carried over from the multiplication above. */
bn.sel w22, w28, w31, M
/* Compute q1' = q1[255:0] = x >> 255
w21 = q1' = [w20, w19] >> 255 */
bn.rshi w21, w20, w19 >> 255
/* Compute q2 = q1*u
Instead of full q2 (which would be up to 514 bits) we ignore the MSb of u
and the MSb of q1 and correct this later. This allows using a 256x256
multiplier.
=> max. length q2': 512 bit
q2' = q1[255:0]*u[255:0] = [w20, w19] = w21 * w28 */
bn.mulqacc.z w21.0, w28.0, 0
bn.mulqacc w21.1, w28.0, 64
bn.mulqacc.so w23.L, w21.0, w28.1, 64
bn.mulqacc w21.2, w28.0, 0
bn.mulqacc w21.1, w28.1, 0
bn.mulqacc w21.0, w28.2, 0
bn.mulqacc w21.3, w28.0, 64
bn.mulqacc w21.2, w28.1, 64
bn.mulqacc w21.1, w28.2, 64
bn.mulqacc.so w23.U, w21.0, w28.3, 64
bn.mulqacc w21.3, w28.1, 0
bn.mulqacc w21.2, w28.2, 0
bn.mulqacc w21.1, w28.3, 0
bn.mulqacc w21.3, w28.2, 64
bn.mulqacc.so w24.L, w21.2, w28.3, 64
bn.mulqacc.so w24.U, w21.3, w28.3, 0
bn.add w20, w20, w31
/* q3 = q2 >> 257
In this step, the compensation for the neglected MSbs of q1 and u is
carried out along the way. To add them in the q2 domain, they would have to
be left shifted by 256 bit first. To directly add them we first shift q2'
by 256 bit to the right (by dropping the lower 256 bit), perform the
additions, and shift the result another bit to the right. */
/* w25 = q1[256] = x[511:256] >> 255 */
bn.rshi w25, w31, w20 >> 255
/* compensate for neglected MSB of u, by adding full q1
this is unconditional since MSB of u is always 1
[w25, w24] = q2'' <= q2'[511:256] + q1 = [w25, w24] <= w24 + [w25, w21] */
bn.add w24, w24, w21
bn.addc w25, w25, w31
/* compensate for neglected MSB of q1, by conditionally adding u */
/* [w25, w24] = q2''' <= q2'' + [0 or u] = [w25, w24] + w22 */
bn.add w24, w24, w22
bn.addc w25, w25, w31
/* q3 = w21 = q2''' >> 1 */
bn.rshi w21, w25, w24 >> 1
/* [w23, w22] <= q3 * p */
bn.mulqacc.z w29.0, w21.0, 0
bn.mulqacc w29.1, w21.0, 64
bn.mulqacc.so w22.L, w29.0, w21.1, 64
bn.mulqacc w29.2, w21.0, 0
bn.mulqacc w29.1, w21.1, 0
bn.mulqacc w29.0, w21.2, 0
bn.mulqacc w29.3, w21.0, 64
bn.mulqacc w29.2, w21.1, 64
bn.mulqacc w29.1, w21.2, 64
bn.mulqacc.so w22.U, w29.0, w21.3, 64
bn.mulqacc w29.3, w21.1, 0
bn.mulqacc w29.2, w21.2, 0
bn.mulqacc w29.1, w21.3, 0
bn.mulqacc w29.3, w21.2, 64
bn.mulqacc.so w23.L, w29.2, w21.3, 64
bn.mulqacc.so w23.U, w29.3, w21.3, 0
bn.add w23, w23, w31
/* We compute the final remainder r by subtracting the estimate q3 from x.
In the generic algorithm, r is already the reduced result or it is off by
either p or 2p. For the special case of the modulus of P-256 it can be
shown that r can only be off by max p. Therefore, only a single
conditional correction is required.
[w20, w22] = r <= [w20, w19] - [w23, w22] = x - q3*p */
bn.sub w22, w19, w22
bn.subb w20, w20, w23
/* r cannot be wider than 257 bit. Therefore r > p if bit 256 of r is
set, and we need to subtract the modulus */
bn.sel w21, w29, w31, L
bn.sub w21, w22, w21
/* this performs the correction in case r is only 256 bit long but still
greater than the modulus */
bn.addm w19, w21, w31
ret
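/**
 * Reference model (Python, illustrative only; not part of the OTBN code): a
 * step-by-step sketch of the Barrett multiplication above, using arbitrary
 * precision integers instead of WDRs. Variable names mirror the register
 * comments; u_lo is the lower 256 bit of u as kept in w28.
 *
 *   def mod_mul_256x256(a, b, p, u_lo):
 *       x = a * b                        # 512 bit product [w20,w19]
 *       q1 = x >> 255                    # 257 bit
 *       q1_lo = q1 & ((1 << 256) - 1)    # w21, q1 without its MSb
 *       q1_msb = q1 >> 256               # bit 256 of q1 = MSb of x
 *       t = (q1_lo * u_lo) >> 256        # q2'[511:256]
 *       t += q1_lo                       # compensate neglected MSb of u
 *       t += q1_msb * u_lo + (q1_msb << 256)   # compensate neglected MSb of q1
 *       q3 = t >> 1                      # q3 = q2 >> 257
 *       r = x - q3 * p                   # for P-256: 0 <= r < 2*p, r < 2^257
 *       if r >> 256:                     # bit 256 set => subtract p once
 *           r -= p
 *       if r >= p:                       # final conditional subtraction (bn.addm)
 *           r -= p
 *       return r
 */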
/**
* Checks if a point is a valid curve point on curve P-256 (secp256r1)
*
* Returns r = x^3 + ax + b mod p
* and s = y^2 mod p
* with x,y being the affine coordinates of the curve point
* a, b and p being the domain parameters of P-256
*
* This routine checks if a point with given x- and y-coordinate is a valid
* curve point on P-256.
* The routine checks whether the coordinates are a solution of the
* Weierstrass equation y^2 = x^3 + ax + b mod p.
* The routine makes use of the property that the domain parameter 'a' can be
* written as a=-3 for the P-256 curve, hence the routine is limited to P-256.
* The routine does not return a boolean result but computes the left side
* and the right side of the Weierstrass equation and leaves the final
* comparison to the caller.
* The routine runs in constant time.
*
* Flags: When leaving this subroutine, flags of FG0 depend on an
* intermediate result and are not usable after return.
* FG1 is not modified in this subroutine.
*
* @param[in] dmem[12]: dptr_r, pointer to dmem location where right
* side result r will be stored
* @param[in] dmem[16]: dptr_s, pointer to dmem location where left side
* result s will be stored
* @param[in] dmem[20]: dptr_x, pointer to dmem location containing affine
* x-coordinate of input point
* @param[in] dmem[24]: dptr_y, pointer to dmem location containing affine
* y-coordinate of input point
* @param[in] w27: b, curve domain parameter
* @param[in] w29: p, modulus, 2^256 > p > 2^255.
* @param[in] w28: u, pre-computed Barrett constant (without u[256]/MSb
* of u, which is always 1 for the allowed range).
* @param[in] w31: all-zero.
* @param[in] MOD: p, modulus, 2^256 > p > 2^255.
*
* clobbered registers: x8, x13, x14, x19, x20, x21, w0, w19 to w25
* clobbered flag groups: FG0
*/
p256_isoncurve:
addi x3, x0, 0
bn.lid x3, 0(x0)
lw x16, 0(x0)
lw x17, 4(x0)
lw x18, 8(x0)
/* load dmem pointer to signature r from dmem: x19 <= dptr_r = dmem[12] */
lw x19, 12(x0)
/* load dmem pointer to signature s from dmem, x20 <= dptr_s = dmem[16] */
lw x20, 16(x0)
/* load dmem pointer to affine x-coordinate of curve point from dmem:
x21 <= dptr_x = dmem[20] */
lw x21, 20(x0)
/* load dmem pointer to affine y-coordinate of curve point from dmem:
x22 <= dptr_y = dmem[24] */
lw x22, 24(x0)
lw x23, 28(x0)
/* setup pointer to WREG w0 */
addi x8, x0, 0
/* load further dmem pointers */
lw x9, 4(x0)
lw x10, 8(x0)
lw x11, 12(x0)
lw x12, 16(x0)
/* setup pointers to WREGs */
addi x13, x0, 24
addi x14, x0, 24
lw x15, 28(x0)
/* load y-coordinate from dmem
w24 = y = dmem[dptr_y] = dmem[x22] */
bn.lid x14, 0(x22)
/* w0 = w19 = y^2 = w24*w24 */
bn.mov w25, w24
jal x1, mod_mul_256x256
bn.mov w0, w19
/* load x-coordinate from dmem
w24 = x = dmem[dptr_x] = dmem[x21] */
bn.lid x13, 0(x21)
/* w19 = x^2 = w24*w24 */
bn.mov w25, w24
jal x1, mod_mul_256x256
/* reload x-coordinate from dmem */
bn.lid x13, 0(x21)
/* w19 = x^3 = x^2 * x = w25*w24 = w19*w24 */
bn.mov w25, w19
jal x1, mod_mul_256x256
/* reload x-coordinate from dmem */
bn.lid x13, 0(x21)
/* for curve P-256, 'a' can be written as a = -3, therefore we subtract
x three times from x^3.
w19 = x^3 + ax = x^3 - 3x mod p */
bn.subm w19, w19, w24
bn.subm w19, w19, w24
bn.subm w19, w19, w24
/* w24 = x^3 + ax + b = w19 + w27 mod p */
bn.addm w24, w19, w27
/* store results in dmem */
/* dmem[dptr_r] = w24 = x^3 + ax + b mod p */
bn.sid x13, 0(x19)
/* dmem[dptr_s] = w0 = y^2 mod p */
bn.sid x8, 0(x20)
ret
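/**
 * Illustrative check (Python, not part of the OTBN code): what the routine
 * computes, written out in plain modular arithmetic. The final comparison of
 * the two sides is left to the caller, exactly as above.
 *
 *   p = 2**256 - 2**224 + 2**192 + 2**96 - 1
 *   b = 0x5AC635D8AA3A93E7B3EBBD55769886BC651D06B0CC53B0F63BCE3C3E27D2604B
 *
 *   def isoncurve(x, y):
 *       rhs = (x*x*x - 3*x + b) % p      # x^3 + a*x + b with a = -3
 *       lhs = (y*y) % p                  # y^2
 *       return lhs, rhs                  # valid point iff lhs == rhs
 */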
/**
* P-256 point addition in projective coordinates
*
* returns R = (x_r, y_r, z_r) <= P+Q = (x_p, y_p, z_p) + (x_q, y_q, z_q)
* with R, P and Q being valid P-256 curve points
* in projective coordinates
*
* This routine adds two valid P-256 curve points in projective space.
* Point addition is performed based on the complete formulas of Bosma and
* Lenstra for Weierstrass curves as first published in [1] and
* optimized in [2].
* The implemented version follows Algorithm 4 of [2] which is an optimized
* variant for Weierstrass curves with domain parameter 'a' set to a=-3.
* Numbering of the steps below and naming of symbols follows the
* terminology of Algorithm 4 of [2].
* The routine is limited to P-256 curve points due to:
* - fixed a=-3 domain parameter
* - usage of a P-256 optimized Barrett multiplication kernel
* This routine runs in constant time.
*
* [1] https://doi.org/10.1006/jnth.1995.1088
* [2] https://doi.org/10.1007/978-3-662-49890-3_16
*
* @param[in] w8: x_p, x-coordinate of input point P
* @param[in] w9: y_p, y-coordinate of input point P
* @param[in] w10: z_p, z-coordinate of input point P
* @param[in] w11: x_q, x-coordinate of input point Q
* @param[in] w12: y_q, y-coordinate of input point Q
* @param[in] w13: z_q, z-coordinate of input point Q
* @param[in] w27: b, curve domain parameter
* @param[in] w29: p, modulus, 2^256 > p > 2^255.
* @param[in] w28: u, pre-computed Barrett constant (without u[256]/MSb
* of u, which is always 1 for the allowed range).
* @param[in] w31: all-zero.
* @param[in] MOD: p, modulus, 2^256 > p > 2^255.
* @param[out] w11: x_r, x-coordinate of resulting point R
* @param[out] w12: y_r, y-coordinate of resulting point R
* @param[out] w13: z_r, z-coordinate of resulting point R
*
* Flags: When leaving this subroutine, flags of FG0 depend on an
* intermediate result and are not usable after return.
* FG1 is not modified in this subroutine.
*
* clobbered registers: w11 to w25
* clobbered flag groups: FG0
*/
proj_add:
/* mapping of parameters to symbols of [2] (Algorithm 4):
X1 = x_p; Y1 = y_p; Z1 = z_p; X2 = x_q; Y2 = y_q; Z2 = z_q
X3 = x_r; Y3 = y_r; Z3 = z_r */
/* 1: w14 = t0 <= X1*X2 = w11*w8 */
bn.mov w24, w11
bn.mov w25, w8
jal x1, mod_mul_256x256
bn.mov w14, w19
/* 2: w15 = t1 <= Y1*Y2 = w12*w9 */
bn.mov w24, w12
bn.mov w25, w9
jal x1, mod_mul_256x256
bn.mov w15, w19
/* 3: w16 = t2 <= Z1*Z2 = w13*w10 */
bn.mov w24, w13
bn.mov w25, w10
jal x1, mod_mul_256x256
bn.mov w16, w19
/* 5: w17 = t4 <= X2+Y2 = w11 + w12 */
bn.addm w17, w11, w12
/* 4: w18 = t3 <= X1+Y1 = w8+w9 */
bn.addm w18, w8, w9
/* 6: w19 = t3 <= t3*t4 = w18*w17 */
bn.mov w24, w17
bn.mov w25, w18
jal x1, mod_mul_256x256
/* 7: w18 = t4 <= t0+t1 = w14+w15 */
bn.addm w18, w14, w15
/* 8: w17 = t3 <= t3 - t4 = w19 - w18 */
bn.subm w17, w19, w18
/* 10: w18 = X3 <= Y2 + Z2 = w12 + w13 */
bn.addm w18, w12, w13
/* 9: w19 = t4 <= Y1 + Z1 = w9 + w10 */
bn.addm w19, w9, w10
/* 11: w18 = t4 <= t4 * X3 = w19 * w18 */
bn.mov w24, w18
bn.mov w25, w19
jal x1, mod_mul_256x256
bn.mov w18, w19
/* 12: w19 = X3 <= t1 + t2 = w15 + w16 */
bn.addm w19, w15, w16
/* 13: w18 = t4 <= t4 - X3 = w18 - w19 */
bn.subm w18, w18, w19
/* 15: w19 = Y3 <= X2 + Z2 = w11 + w13 */
bn.addm w19, w11, w13
/* 14: w12 = X3 <= X1 + Z1 = w8 + w10 */
bn.addm w12, w8, w10
/* 16: w11 = X3 <= X3 * Y3 = w12 * w19 */
bn.mov w24, w19
bn.mov w25, w12
jal x1, mod_mul_256x256
bn.mov w11, w19
/* 17: w12 = Y3 <= t0 + t2 = w14 + w16 */
bn.addm w12, w14, w16
/* 18: w12 = Y3 <= X3 - Y3 = w11 - w12 */
bn.subm w12, w11, w12
/* 19: w19 = Z3 <= b * t2 = w27 * w16 */
bn.mov w24, w27
bn.mov w25, w16
jal x1, mod_mul_256x256
/* 20: w11 = X3 <= Y3 - Z3 = w12 - w19 */
bn.subm w11, w12, w19
/* 21: w13 = Z3 <= X3 + X3 = w11 + w11 */
bn.addm w13, w11, w11
/* 22: w11 = X3 <= w11 + w13 = X3 + Z3 */
bn.addm w11, w11, w13
/* 23: w13 = Z3 <= t1 - X3 = w15 - w11 */
bn.subm w13, w15, w11
/* 24: w11 = X3 <= t1 + X3 = w15 + w11 */
bn.addm w11, w15, w11
/* 25: w19 = Y3 <= w27 * w12 = b * Y3 */
bn.mov w24, w27
bn.mov w25, w12
jal x1, mod_mul_256x256
/* 26: w15 = t1 <= t2 + t2 = w16 + w16 */
bn.addm w15, w16, w16
/* 27: w16 = t2 <= t1 + t2 = w15 + w16 */
bn.addm w16, w15, w16
/* 28: w12 = Y3 <= Y3 - t2 = w19 - w16 */
bn.subm w12, w19, w16
/* 29: w12 = Y3 <= Y3 - t0 = w12 - w14 */
bn.subm w12, w12, w14
/* 30: w15 = t1 <= Y3 + Y3 = w12 + w12 */
bn.addm w15, w12, w12
/* 31: w12 = Y3 <= t1 + Y3 = w15 + w12 */
bn.addm w12, w15, w12
/* 32: w15 = t1 <= t0 + t0 = w14 + w14 */
bn.addm w15, w14, w14
/* 33: w14 = t0 <= t1 + t0 = w15 + w14 */
bn.addm w14, w15, w14
/* 34: w14 = t0 <= t0 - t2 = w14 - w16 */
bn.subm w14, w14, w16
/* 35: w15 = t1 <= t4 * Y3 = w18 * w12 */
bn.mov w24, w18
bn.mov w25, w12
jal x1, mod_mul_256x256
bn.mov w15, w19
/* 36: w16 = t2 <= t0 * Y3 = w14 * w12 */
bn.mov w24, w14
bn.mov w25, w12
jal x1, mod_mul_256x256
bn.mov w16, w19
/* 37: w12 = Y3 <= X3 * Z3 = w11 * w13 */
bn.mov w24, w11
bn.mov w25, w13
jal x1, mod_mul_256x256
/* 38: w12 = Y3 <= Y3 + t2 = w19 + w16 */
bn.addm w12, w19, w16
/* 39: w19 = X3 <= t3 * X3 = w17 * w11 */
bn.mov w24, w17
bn.mov w25, w11
jal x1, mod_mul_256x256
/* 40: w11 = X3 <= X3 - t1 = w19 - w15 */
bn.subm w11, w19, w15
/* 41: w13 = Z3 <= t4 * Z3 = w18 * w13 */
bn.mov w24, w18
bn.mov w25, w13
jal x1, mod_mul_256x256
bn.mov w13, w19
/* 42: w19 = t1 <= t3 * t0 = w17 * w14 */
bn.mov w24, w17
bn.mov w25, w14
jal x1, mod_mul_256x256
/* 43: w13 = Z3 <= Z3 + t1 = w13 + w19 */
bn.addm w13, w13, w19
ret
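/**
 * Reference model (Python, illustrative only; not part of the OTBN code):
 * the complete addition formula above, one line per numbered step of
 * Algorithm 4 in [2], using plain modular arithmetic in place of the
 * Barrett multiplication kernel.
 *
 *   def proj_add(X1, Y1, Z1, X2, Y2, Z2, b, p):
 *       t0 = X1 * X2 % p          #  1
 *       t1 = Y1 * Y2 % p          #  2
 *       t2 = Z1 * Z2 % p          #  3
 *       t3 = (X1 + Y1) % p        #  4
 *       t4 = (X2 + Y2) % p        #  5
 *       t3 = t3 * t4 % p          #  6
 *       t4 = (t0 + t1) % p        #  7
 *       t3 = (t3 - t4) % p        #  8
 *       t4 = (Y1 + Z1) % p        #  9
 *       X3 = (Y2 + Z2) % p        # 10
 *       t4 = t4 * X3 % p          # 11
 *       X3 = (t1 + t2) % p        # 12
 *       t4 = (t4 - X3) % p        # 13
 *       X3 = (X1 + Z1) % p        # 14
 *       Y3 = (X2 + Z2) % p        # 15
 *       X3 = X3 * Y3 % p          # 16
 *       Y3 = (t0 + t2) % p        # 17
 *       Y3 = (X3 - Y3) % p        # 18
 *       Z3 = b * t2 % p           # 19
 *       X3 = (Y3 - Z3) % p        # 20
 *       Z3 = (X3 + X3) % p        # 21
 *       X3 = (X3 + Z3) % p        # 22
 *       Z3 = (t1 - X3) % p        # 23
 *       X3 = (t1 + X3) % p        # 24
 *       Y3 = b * Y3 % p           # 25
 *       t1 = (t2 + t2) % p        # 26
 *       t2 = (t1 + t2) % p        # 27
 *       Y3 = (Y3 - t2) % p        # 28
 *       Y3 = (Y3 - t0) % p        # 29
 *       t1 = (Y3 + Y3) % p        # 30
 *       Y3 = (t1 + Y3) % p        # 31
 *       t1 = (t0 + t0) % p        # 32
 *       t0 = (t1 + t0) % p        # 33
 *       t0 = (t0 - t2) % p        # 34
 *       t1 = t4 * Y3 % p          # 35
 *       t2 = t0 * Y3 % p          # 36
 *       Y3 = X3 * Z3 % p          # 37
 *       Y3 = (Y3 + t2) % p        # 38
 *       X3 = t3 * X3 % p          # 39
 *       X3 = (X3 - t1) % p        # 40
 *       Z3 = t4 * Z3 % p          # 41
 *       t1 = t3 * t0 % p          # 42
 *       Z3 = (Z3 + t1) % p        # 43
 *       return X3, Y3, Z3
 */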
/**
* Convert projective coordinates of a P-256 curve point to affine coordinates
*
* returns P = (x_a, y_a) = (x/z mod p, y/z mod p)
* with P being a valid P-256 curve point
* x_a and y_a being the affine coordinates of said curve point
* x, y and z being a set of projective coordinates of said point
* and p being the modulus of the P-256 underlying finite field.
*
* This routine computes the affine coordinates for a set of projective
* coordinates of a valid P-256 curve point. The routine performs the required
* divisions by computing the multiplicative modular inverse of the
* projective z-coordinate in the underlying finite field of the P-256 curve.
* For inverse computation Fermat's little theorem is used, i.e.
* we compute z^-1 = z^(p-2) mod p.
* For the exponentiation a 27-step addition chain is used.
* This addition chain is presumably the shortest addition chain known to
* date for the exponent exp = p - 2 = 2^256 - 2^224 + 2^192 + 2^96 - 3.
* The origin of the chain cannot be fully traced. [1] attributes it to a
* specific patch to OpenSSL. The same addition chain is used in the OpenSSL
* and BoringSSL crypto libraries.
* This routine runs in constant time.
*
* [1] https://doi.org/10.1007/s13389-014-0090-x
* https://eprint.iacr.org/2013/816.pdf
*
* Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
* the computed affine y-coordinate.
*
* @param[in] w8: x, x-coordinate of curve point (projective)
* @param[in] w9: y, y-coordinate of curve point (projective)
* @param[in] w10: z, z-coordinate of curve point (projective)
* @param[in] w29: p, modulus, 2^256 > p > 2^255.
* @param[in] w28: u, pre-computed Barrett constant (without u[256]/MSb
* of u, which is always 1 for the allowed range).
* @param[in] MOD: p, modulus of the finite field of P-256
* @param[out] w11: x_a, x-coordinate of curve point (affine)
* @param[out] w12: y_a, y-coordinate of curve point (affine)
*
* clobbered registers: w10 to w19, w24, w25
* clobbered flag groups: FG0
*/
proj_to_affine:
/* 1: exp = 0x1 */
bn.addm w10, w10, w31
/* 2: exp = 0x2 = 2*0x1 */
bn.mov w24, w10
bn.mov w25, w10
jal x1, mod_mul_256x256
/* 3: exp = 0x3 = 0x2+0x1 */
bn.mov w24, w19
bn.mov w25, w10
jal x1, mod_mul_256x256
bn.mov w12, w19
/* 4: exp = 0x6 = 2*0x3 */
bn.mov w24, w19
bn.mov w25, w19
jal x1, mod_mul_256x256
/* 5: exp = 0xc = 2*0x6 */
bn.mov w24, w19
bn.mov w25, w19
jal x1, mod_mul_256x256
/* 6: exp = 0xf = 0xc+0x3 */
bn.mov w24, w19
bn.mov w25, w12
jal x1, mod_mul_256x256
bn.mov w13, w19
/* 7: exp = 0xf0 = 16*0xf */
loopi 4, 4
bn.mov w24, w19
bn.mov w25, w19
jal x1, mod_mul_256x256
nop
/* 8: exp = 0xff = 0xf0+0xf */
bn.mov w24, w19
bn.mov w25, w13
jal x1, mod_mul_256x256
bn.mov w14, w19
/* 9: exp = 0xff00 = 256*0xff */
loopi 8, 4
bn.mov w24, w19
bn.mov w25, w19
jal x1, mod_mul_256x256
nop
/* 10: exp = 0xffff = 0xff00+0xff */
bn.mov w24, w19
bn.mov w25, w14
jal x1, mod_mul_256x256
bn.mov w15, w19
/* 11: exp = 0xffff0000 = 2^16*0xffff */
loopi 16, 4
bn.mov w24, w19
bn.mov w25, w19
jal x1, mod_mul_256x256
nop
/* 12: exp = 0xffffffff = 0xffff0000+0xffff */
bn.mov w24, w19
bn.mov w25, w15
jal x1, mod_mul_256x256
bn.mov w16, w19
/* 13: exp = 0xffffffff00000000 = 2^32*0xffffffff */
loopi 32, 4
bn.mov w24, w19
bn.mov w25, w19
jal x1, mod_mul_256x256
nop
bn.mov w17, w19
/* 14: exp = 0xffffffff00000001 = 0xffffffff00000000+0x1 */
bn.mov w24, w10
bn.mov w25, w19
jal x1, mod_mul_256x256
/* 15: exp =
0xffffffff00000001000000000000000000000000000000000000000000000000
= 2^192*0xffffffff00000001 */
loopi 192, 4
bn.mov w24, w19
bn.mov w25, w19
jal x1, mod_mul_256x256
nop
bn.mov w18, w19
/* 16: exp = 0xffffffffffffffff = 0xffffffff00000000+0xffffffff */
bn.mov w24, w17
bn.mov w25, w16
jal x1, mod_mul_256x256
/* 17: exp = 0xffffffffffffffff0000 = 2^16*0xffffffffffffffff */
loopi 16, 4
bn.mov w24, w19
bn.mov w25, w19
jal x1, mod_mul_256x256
nop
/* 18: exp = 0xffffffffffffffffffff = 0xffffffffffffffff0000+0xffff */
bn.mov w24, w15
bn.mov w25, w19
jal x1, mod_mul_256x256
/* 19: exp = 0xffffffffffffffffffff00 = 256*0xffffffffffffffffffff */
loopi 8, 4
bn.mov w24, w19
bn.mov w25, w19
jal x1, mod_mul_256x256
nop
/* 20: exp = 0xffffffffffffffffffffff = 0xffffffffffffffffffff00+0xff */
bn.mov w24, w14
bn.mov w25, w19
jal x1, mod_mul_256x256
/* 21: exp = 0xffffffffffffffffffffff0 = 16*0xffffffffffffffffffffff */
loopi 4, 4
bn.mov w24, w19
bn.mov w25, w19
jal x1, mod_mul_256x256
nop
/* 22: exp = 0xfffffffffffffffffffffff = 0xffffffffffffffffffffff0+0xf */
bn.mov w24, w13
bn.mov w25, w19
jal x1, mod_mul_256x256
/* 23: exp = 0x3ffffffffffffffffffffffc = 4*0xfffffffffffffffffffffff */
loopi 2, 4
bn.mov w24, w19
bn.mov w25, w19
jal x1, mod_mul_256x256
nop
/* 24: exp = 0x3fffffffffffffffffffffff = 0x3ffffffffffffffffffffffc+0x3 */
bn.mov w24, w12
bn.mov w25, w19
jal x1, mod_mul_256x256
/* 25: exp = 0xfffffffffffffffffffffffc = 4*0x3fffffffffffffffffffffff */
loopi 2, 4
bn.mov w24, w19
bn.mov w25, w19
jal x1, mod_mul_256x256
nop
/* 26: exp = 0xfffffffffffffffffffffffd = 0xfffffffffffffffffffffffc+0x1 */
bn.mov w24, w10
bn.mov w25, w19
jal x1, mod_mul_256x256
/* 27: exp = p-2
= 0xffffffff00000001000000000000000000000000fffffffffffffffffffffffd
= 0xfffffffffffffffffffffffd
+ 0xffffffff00000001000000000000000000000000000000000000000000000000
w14 = z^exp = z^(p-2) = z^-1 mod p */
bn.mov w24, w19
bn.mov w25, w18
jal x1, mod_mul_256x256
bn.mov w14, w19
/* convert x-coordinate to affine
w11 = x_a = x/z = x * z^(-1) = w8 * w14 */
bn.mov w24, w8
bn.mov w25, w14
jal x1, mod_mul_256x256
bn.mov w11, w19
/* convert y-coordinate to affine
w12 = y_a = y/z = y * z^(-1) = w9 * w14 */
bn.mov w24, w9
bn.mov w25, w14
jal x1, mod_mul_256x256
bn.mov w12, w19
ret
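/**
 * Reference model (Python, illustrative only; not part of the OTBN code):
 * the 27-step addition chain above written with pow(); sq(x, k) performs k
 * repeated squarings, each multiplication appends the bits of a previously
 * saved partial exponent.
 *
 *   def mod_inv_p(z, p):
 *       sq = lambda x, k: pow(x, 1 << k, p)    # k repeated squarings
 *       z3  = pow(z, 3, p)                     # steps 1-3:   exp = 0x3
 *       zf  = sq(z3, 2) * z3 % p               # steps 4-6:   exp = 0xf
 *       z8  = sq(zf, 4) * zf % p               # steps 7-8:   exp = 0xff
 *       z16 = sq(z8, 8) * z8 % p               # steps 9-10:  exp = 0xffff
 *       z32 = sq(z16, 16) * z16 % p            # steps 11-12: exp = 0xffffffff
 *       h   = sq(z32, 32)                      # step 13
 *       hi  = sq(h * z % p, 192)               # steps 14-15
 *       acc = h * z32 % p                      # step 16: exp = 0xffffffffffffffff
 *       acc = sq(acc, 16) * z16 % p            # steps 17-18
 *       acc = sq(acc, 8) * z8 % p              # steps 19-20
 *       acc = sq(acc, 4) * zf % p              # steps 21-22
 *       acc = sq(acc, 2) * z3 % p              # steps 23-24
 *       acc = sq(acc, 2) * z % p               # steps 25-26
 *       return acc * hi % p                    # step 27: z^(p-2) = z^-1 mod p
 *
 *   # sanity check: mod_inv_p(z, p) == pow(z, p - 2, p)
 */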
/**
* Variable time modular multiplicative inverse computation
*
* returns x_inv = x^-1 mod m
*
* This routine computes the modular multiplicative inverse for any x < m in
* the field GF(m).
* For inverse computation Fermat's little theorem is used, i.e.
* we compute x^-1 = x^(m-2) mod m.
* For exponentiation we use a standard, variable time (!) square and multiply
* algorithm.
*
* @param[in] w0: x, a 256 bit operand with x < m
* @param[in] w29: m, modulus, 2^256 > m > 2^255.
* @param[in] w28: u, lower 256 bit of pre-computed Barrett constant
* @param[in] w30: constant 1
* @param[in] w31: all-zero
* @param[in] MOD: m, modulus
* @param[out] w1: x_inv, modular multiplicative inverse
*
* Flags: When leaving this subroutine, flags of FG0 depend on a
* potentially discarded intermediate result and are not usable after
* return.
* FG1 is not modified in this subroutine.
*
* clobbered registers: w1, w2, w3, w19, w24, w25
* clobbered flag groups: FG0
*/
mod_inv:
/* subtract 2 from modulus for Fermat's little theorem
w2 = MOD - 2 = m - 2 */
bn.wsrr w2, 0
bn.subi w2, w2, 2
/* init square and multiply: w1 = 1 */
bn.mov w1, w30
/* square and multiply loop */
loopi 256, 14
/* square: w3 = w19 = w24*w25 = w1^2 mod m */
bn.mov w24, w1
bn.mov w25, w1
jal x1, mod_mul_256x256
bn.mov w3, w19
/* shift MSB into carry flag
w2 = 2*w2 = w2 << 1 */
bn.add w2, w2, w2
/* skip multiplication if C flag not set */
bn.sel w1, w1, w3, C
csrrs x2, 1984, x0
andi x2, x2, 1
beq x2, x0, nomul
/* multiply: w1 = w19 = w24*w25 = w3*w0 mod m */
bn.mov w24, w3
bn.mov w25, w0
jal x1, mod_mul_256x256
bn.mov w1, w19
nomul:
nop
ret
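/**
 * Reference model (Python, illustrative only; not part of the OTBN code):
 * the square-and-multiply loop above, walking the bits of m-2 MSB first.
 * Like the OTBN routine, this is variable time in the exponent (which here
 * is derived from the public modulus m).
 *
 *   def mod_inv(x, m):
 *       e = m - 2                    # Fermat: x^-1 = x^(m-2) mod m
 *       acc = 1
 *       for i in reversed(range(256)):
 *           acc = acc * acc % m      # always square
 *           if (e >> i) & 1:         # multiply only if exponent bit is set
 *               acc = acc * x % m
 *       return acc
 */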
/**
* Fetch curve point from dmem and randomize z-coordinate
*
* returns P = (x, y, z) = (x_a*z, y_a*z, z)
* with P being a valid P-256 curve point in projective coordinates
* x_a and y_a being the affine coordinates as fetched from dmem
* z being a randomized z-coordinate
*
* This routine fetches the affine x- and y-coordinates of a curve point from
* dmem and computes a valid set of projective coordinates. The z-coordinate is
* randomized and x and y are scaled appropriately.
* This routine runs in constant time.
*
* @param[in] x10: constant 24
* @param[in] x21: dptr_x, pointer to dmem location containing affine
* x-coordinate of input point
* @param[in] x22: dptr_y, pointer to dmem location containing affine
* y-coordinate of input point
* @param[in] w28: u, lower 256 bit of Barrett constant for curve P-256
* @param[in] w29: p, modulus of P-256 underlying finite field
* @param[in] w31: all-zero
* @param[in] MOD: p, modulus of P-256 underlying finite field
* @param[out] w26: z, random projective z-coordinate
* @param[out] w6: x, projective x-coordinate
* @param[out] w7: y, projective y-coordinate
*
* Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
* the scaled projective y-coordinate.
*
* clobbered registers: w2, w6, w7, w19, w24, w25, w26
* clobbered flag groups: FG0
*/
fetch_proj_randomize:
/* get random number */
bn.wsrr w2, 1
/* reduce random number
w26 = z <= w2 mod p */
bn.addm w26, w2, w31
/* fetch x-coordinate from dmem
w24 = x_a <= dmem[x21] = dmem[dptr_x] */
bn.lid x10, 0(x21)
/* scale x-coordinate
w6 = x <= w24*w26 = x_a*z mod p */
bn.mov w25, w26
jal x1, mod_mul_256x256
bn.mov w6, w19
/* fetch y-coordinate from dmem
w24 = y_a <= dmem[x22] = dmem[dptr_y] */
bn.lid x10, 0(x22)
/* scale y-coordinate
w7 = y <= w24*w26 = y_a*z mod p */
bn.mov w25, w26
jal x1, mod_mul_256x256
bn.mov w7, w19
ret
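/**
 * Minimal sketch (Python, illustrative only; not part of the OTBN code) of
 * the coordinate scaling, assuming some hypothetical 256 bit random source
 * rnd256() in place of the RND WSR:
 *
 *   def fetch_proj_randomize(x_a, y_a, p):
 *       z = rnd256() % p                       # models bn.wsrr + bn.addm
 *       return (x_a * z % p, y_a * z % p, z)   # (x, y, z) ~ (x_a, y_a, 1)
 */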
/**
* P-256 point doubling in projective space
*
* returns R = (x_r, y_r, z_r) <= 2*P = 2*(x_p, y_p, z_p)
* with R, P being valid P-256 curve points
*
* This routine doubles a given P-256 curve point in projective coordinates.
* Internally this routine makes use of the point addition routine and
* adds the point to itself.
* This routine runs in constant time.
*
* @param[in] w8: x_p, x-coordinate of input point
* @param[in] w9: y_p, y-coordinate of input point
* @param[in] w10: z_p, z-coordinate of input point
* @param[in] w27: b, curve domain parameter
* @param[in] w29: p, modulus of P-256 underlying finite field
* @param[in] w28: u, lower 256 bit of Barrett constant for curve P-256
* @param[in] w31: all-zero.
* @param[in] MOD: p, modulus of P-256 underlying finite field
* @param[out] w11: x_r, x-coordinate of resulting point
* @param[out] w12: y_r, y-coordinate of resulting point
* @param[out] w13: z_r, z-coordinate of resulting point
*
* Flags: When leaving this subroutine, flags of FG0 depend on an
* intermediate result and are not usable after return.
* FG1 is not modified in this subroutine.
*
* clobbered registers: w11 to w25
* clobbered flag groups: FG0
*/
proj_double:
/* Q = (x_q, y_q, z_q) = (w11, w12, w13) <= P = (x_p, y_p, z_p)
= (w8, w9, w10) */
bn.mov w11, w8
bn.mov w12, w9
bn.mov w13, w10
/* R = (x_r, y_r, z_r) = (w11, w12, w13) = P+Q
= (w8, w9, w10) + (w11, w12, w13) = (x_p, y_p, z_p) + (x_q, y_q, z_q) */
jal x1, proj_add
ret
/**
* Setup Barrett parameters for order of base point of P-256
*
* Loads n, the order of the base point G of P-256, into the modulus register.
* Loads the matching Barrett constant u = floor(2^512/n)[255:0].
*
* This routine runs in constant time.
*
* Flags: This routine does not modify any flag group
*
* @param[out] w29: n, order of base point G of P-256
* @param[out] w28: u_n, lower 256 bit of Barrett constant base point order
* @param[out] MOD: n, order of base point G of P-256
*
* clobbered registers: x2, w28, w29
* clobbered flag groups: none
*/
setup_barrett_n:
/* load order of base point G of P-256
w29 = n =
0XFFFFFFFF00000000FFFFFFFFFFFFFFFFBCE6FAADA7179E84F3B9CAC2FC632551 */
addi x2, x0, 0
sw x2, 632(x0)
addi x2, x2, -1
sw x2, 624(x0)
sw x2, 628(x0)
sw x2, 636(x0)
lui x2, 1033778
addi x2, x2, 1361
sw x2, 608(x0)
lui x2, 998301
addi x2, x2, -1342
sw x2, 612(x0)
lui x2, 684410
addi x2, x2, -380
sw x2, 616(x0)
lui x2, 773744
addi x2, x2, -1363
sw x2, 620(x0)
addi x2, x0, 29
bn.lid x2, 608(x0)
/* store n to MOD WSR */
bn.wsrw 0, w29
/* load Barrett constant u (lower 256 bit only)
w28 = u_n = floor(2^512/n)[255:0] =
0X00000000FFFFFFFFFFFFFFFEFFFFFFFF43190552DF1A6C21012FFD85EEDF9BFE */
addi x2, x0, 0
sw x2, 668(x0)
addi x2, x0, -1
sw x2, 656(x0)
sw x2, 664(x0)
addi x2, x0, -2
sw x2, 660(x0)
lui x2, 978426
addi x2, x2, -1026
sw x2, 640(x0)
lui x2, 4864
addi x2, x2, -635
sw x2, 644(x0)
lui x2, 913831
addi x2, x2, -991
sw x2, 648(x0)
lui x2, 274832
addi x2, x2, 1362
sw x2, 652(x0)
addi x2, x0, 28
bn.lid x2, 640(x0)
ret
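/**
 * Illustrative cross-check (Python, not part of the OTBN code), analogous
 * to the one given after setup_barrett_p:
 *
 *   n = 0xFFFFFFFF00000000FFFFFFFFFFFFFFFFBCE6FAADA7179E84F3B9CAC2FC632551
 *   u_n = 2**512 // n - 2**256   # lower 256 bit of the Barrett constant above
 */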
/**
* P-256 scalar point multiplication in affine space
*
* returns R = k*P = k*(x_p, y_p)
* with R, P being valid P-256 curve points in affine coordinates
* k being a 256 bit scalar
*
* This routine performs scalar multiplication based on the group laws
* of Weierstrass curves.
* A constant time double-and-add algorithm (sometimes referred to as
* double-and-add-always) is used.
* Due to the P-256 optimized implementations of the called routines for
* point addition and doubling, this routine is limited to P-256.
* The routine makes use of blinding by additively splitting the
* exponent/scalar k into a random number (rnd) and k-rnd. The double-and-add
* loop operates on both shares in parallel, applying Shamir's trick.
*
* @param[in] x9: constant 1
* @param[in] x10: constant 24
* @param[in] x17: dptr_rnd, pointer to location in dmem containing random
* to be used for additive splitting of scalar
* @param[in] x21: dptr_x, pointer to affine x-coordinate in dmem
* @param[in] x22: dptr_y, pointer to affine y-coordinate in dmem
* @param[in] w0: k, scalar for multiplication
* @param[in] w27: b, curve domain parameter
* @param[in] w28: u, pre-computed Barrett constant (without u[256]/MSb
* of u, which is always 1 for the allowed range).
* @param[in] w29: p, modulus, 2^256 > p > 2^255.
* @param[in] w30: constant 1
* @param[in] w31: all-zero
* @param[in] MOD: p, modulus, 2^256 > p > 2^255.
* @param[out] w11: x_r, affine x-coordinate of resulting point
* @param[out] w12: y_r, affine y-coordinate of resulting point
*
* Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
* the computed affine y-coordinate.
*
* clobbered registers: w0 to w25
* clobbered flag groups: FG0
*/
scalar_mult_int:
/* setup domain parameter n (order of base point G) as the modulus */
jal x1, setup_barrett_n
/* fetch externally supplied random number from dmem
w1 = dmem[dptr_rnd] = rnd */
bn.lid x9, 0(x17)
/* 1st share (reduced rnd)
rnd = w1 <= rnd mod n */
bn.addm w1, w1, w31
/* 2nd share (k-rnd)
w0 = w0 - w1 = k - rnd mod n */
bn.subm w0, w0, w1
/* setup Barrett parameter and modulus for P-256 underlying field */
jal x1, setup_barrett_p
/* get randomized projective coodinates of curve point
P = (x_p, y_p, z_p) = (w8, w9, w10) = (w6, w7, w26) =
(x*z mod p, y*z mod p, z) */
jal x1, fetch_proj_randomize
bn.mov w8, w6
bn.mov w9, w7
bn.mov w10, w26
/* Init 2P; it will be used for the addition part in the double-and-add
loop when the bit at the current index is 1 for both shares of the scalar.
2P = (w3, w4, w5) <= (w11, w12, w13) <= 2*(w8, w9, w10) = 2*P */
jal x1, proj_double
bn.mov w3, w11
bn.mov w4, w12
bn.mov w5, w13
/* init double-and-add with the point at infinity
Q = (w8, w9, w10) <= (0, 1, 0) */
bn.mov w8, w31
bn.mov w9, w30
bn.mov w10, w31
/* double-and-add loop with decreasing index */
loopi 256, 32
/* double point Q
Q = (w11, w12, w13) <= 2*(w8, w9, w10) = 2*Q */
jal x1, proj_double
/* re-fetch and randomize P again
P = (w6, w7, w26) */
jal x1, fetch_proj_randomize
/* probe whether the MSb of either of the two scalars (rnd or k-rnd), but
not both, is 1.
If only one MSb is set, select P for addition.
If both MSbs are set, select 2P for addition.
(If neither MSb is set, 2P will also be selected, but the result will be
discarded later.) */
bn.xor w8, w0, w1
/* P = (w8, w9, w10)
<= (w0[255] xor w1[255])?P=(w6, w7, w26):2P=(w3, w4, w5) */
bn.sel w8, w6, w3, M
bn.sel w9, w7, w4, M
bn.sel w10, w26, w5, M
/* save doubling result to survive follow-up subroutine call
Q = (w2, w6, w7) <= (w11, w12, w13) */
bn.mov w2, w11
bn.mov w6, w12
bn.mov w7, w13
/* add points
Q+P = (w11, w12, w13) <= (w11, w12, w13) + (w8, w9, w10) */
jal x1, proj_add
/* probe if MSb of either one or both of the two
scalars (rnd or k-rnd) is 1.*/
bn.or w8, w0, w1
/* select doubling result (Q) or addition result (Q+P)
Q = w0[255] or w1[255]?Q_a=(w11, w12, w13):Q=(w2, w6, w7) */
bn.sel w8, w11, w2, M
bn.sel w9, w12, w6, M
bn.sel w10, w13, w7, M
/* rotate both scalars left 1 bit */
bn.rshi w0, w0, w0 >> 255
bn.rshi w1, w1, w1 >> 255
/* init regs with random numbers */
bn.wsrr w11, 1
bn.wsrr w12, 1
bn.wsrr w13, 1
/* get a fresh random number and scale the coordinates of
2P = (w3, w4, w5) (scaling each projective coordinate with the same
factor results in the same point) */
bn.wsrr w2, 1
/* w3 = w3 * w2 */
bn.mov w24, w3
bn.mov w25, w2
jal x1, mod_mul_256x256
bn.mov w3, w19
/* w4 = w4 * w2 */
bn.mov w24, w4
bn.mov w25, w2
jal x1, mod_mul_256x256
bn.mov w4, w19
/* w5 = w5 * w2 */
bn.mov w24, w5
bn.mov w25, w2
jal x1, mod_mul_256x256
bn.mov w5, w19
/* convert back to affine coordinates
R = (x_a, y_a) = (w11, w12) */
jal x1, proj_to_affine
ret
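/**
 * Algorithmic sketch (Python, illustrative only; not part of the OTBN code)
 * of the blinded double-and-add loop above. proj_add is the model sketched
 * earlier (signature abbreviated), proj_double(P) can be taken as
 * proj_add(P, P), to_affine divides by the Z-coordinate as in
 * proj_to_affine, and rnd256() is a hypothetical 256 bit random source.
 * Register-level details (Z-coordinate re-randomization per iteration,
 * constant time selection via bn.sel) are omitted.
 *
 *   def scalar_mult(k, P, n):
 *       rnd = rnd256() % n
 *       s0, s1 = (k - rnd) % n, rnd        # additive shares: k = s0+s1 mod n
 *       Q = (0, 1, 0)                      # point at infinity
 *       P2 = proj_double(P)
 *       for i in reversed(range(256)):
 *           Q = proj_double(Q)
 *           b0, b1 = (s0 >> i) & 1, (s1 >> i) & 1
 *           A = P2 if (b0 and b1) else P   # bn.sel on the XOR of the MSbs
 *           T = proj_add(Q, A)             # always add (double-and-add-always)
 *           if b0 or b1:                   # keep the sum only if a bit is set
 *               Q = T
 *       return to_affine(Q)
 */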
/**
* Load the affine coordinates of the base point G of P-256
*
* Flags: This routine does not modify any flag group
*
* @param[out] w8: x_g, base point affine x-coordinate
* @param[out] w9: y_g, base point affine y-coordinate
*
* clobbered registers: x2, w8, w9
* clobbered flag groups: none
*/
load_basepoint:
/* load base point x-coordinate
w8 = x_g =
0X6B17D1F2E12C4247F8BCE6E563A440F277037D812DEB33A0F4A13945D898C296 */
lui x2, 887180
addi x2, x2, 662
sw x2, 672(x0)
lui x2, 1002004
addi x2, x2, -1723
sw x2, 676(x0)
lui x2, 188083
addi x2, x2, 928
sw x2, 680(x0)
lui x2, 487480
addi x2, x2, -639
sw x2, 684(x0)
lui x2, 408132
addi x2, x2, 242
sw x2, 688(x0)
lui x2, 1018830
addi x2, x2, 1765
sw x2, 692(x0)
lui x2, 922308
addi x2, x2, 583
sw x2, 696(x0)
lui x2, 438653
addi x2, x2, 498
sw x2, 700(x0)
addi x2, x0, 8
bn.lid x2, 672(x0)
/* load base point y-coordinate
w9 = y_g =
0X4FE342E2FE1A7F9B8EE7EB4A7C0F9E162BCE33576B315ECECBB6406837BF51F5 */
lui x2, 228341
addi x2, x2, 501
sw x2, 704(x0)
lui x2, 834404
addi x2, x2, 104
sw x2, 708(x0)
lui x2, 439062
addi x2, x2, -306
sw x2, 712(x0)
lui x2, 179427
addi x2, x2, 855
sw x2, 716(x0)
lui x2, 508154
addi x2, x2, -490
sw x2, 720(x0)
lui x2, 585343
addi x2, x2, -1206
sw x2, 724(x0)
lui x2, 1040808
addi x2, x2, -101
sw x2, 728(x0)
lui x2, 327220
addi x2, x2, 738
sw x2, 732(x0)
addi x2, x0, 9
bn.lid x2, 704(x0)
ret
/**
* P-256 ECDSA signature generation
*
* returns the signature as the pair r, s with
* r = x_1 mod n
* and s = k^(-1)(msg + r*d) mod n
* with x_1 being the affine x-coordinate of the curve point k*G,
* where G is the curve's base point.
* k being a supplied secret random number,
* n being the order of the base point G of P-256,
* msg being the msg to be signed,
* d being the private key.
*
* This routine runs in constant time.
*
* @param[in] dmem[0]: dptr_k, pointer to a 256 bit random secret in dmem
* @param[in] dmem[4]: dptr_rnd, pointer to location in dmem containing random
* number for blinding
* @param[in] dmem[8]: dptr_msg, pointer to the message to be signed in dmem
* @param[in] dmem[12]: dptr_r, pointer to dmem location where r component
* of signature will be placed
* @param[in] dmem[16]: dptr_s, pointer to dmem location where s component
* of signature will be placed
* @param[in] dmem[20]: dptr_x, pointer to dmem location for temporary
* storage of the base point's affine x-coordinate
* @param[in] dmem[24]: dptr_y, pointer to dmem location for temporary
* storage of the base point's affine y-coordinate
* @param[in] dmem[28]: dptr_d, pointer to private key d in dmem
*
* Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
* the computed affine y-coordinate.
*
* clobbered registers: x3, x8 to x23, w0 to w25
* clobbered flag groups: FG0
*/
p256_sign:
addi x0, x0, 0
addi x3, x0, 0
bn.lid x3, 0(x0)
/* load dmem pointer to secret random number k in dmem
x16 <= dptr_k = dmem[0]*/
lw x16, 0(x0)
/* load dmem pointer to random number for blinding rnd in dmem:
x17 <= dptr_rnd = dmem[4] */
lw x17, 4(x0)
/* load dmem pointer to message msg in dmem: x18 <= dptr_msg = dmem[8] */
lw x18, 8(x0)
/* load dmem pointer to signature r in dmem: x19 <= dptr_r = dmem[12] */
lw x19, 12(x0)
/* load dmem pointer to signature s in dmem: x20 <= dptr_s = dmem[16] */
lw x20, 16(x0)
/* load dmem pointer to affine x-coordinate in dmem:
x21 <= dptr_x = dmem[20] */
lw x21, 20(x0)
/* load dmem pointer to affine y-coordinate in dmem:
x22 <= dptr_y = dmem[24] */
lw x22, 24(x0)
/* load dmem pointer to private key d in dmem: x23 <= dptr_d = dmem[28] */
lw x23, 28(x0)
/* setup pointers into reg file */
addi x8, x0, 0
addi x9, x0, 1
addi x10, x0, 24
/* setup more dmem pointers */
lw x11, 12(x0)
/* setup pointers into reg file */
addi x12, x0, 8
addi x13, x0, 9
/* load dmem pointer to affine y-coordinate in dmem: x14 <= dptr_y = dmem[24] */
lw x14, 24(x0)
/* load dmem pointer to private key d in dmem: x15 <= dptr_d = dmem[28] */
lw x15, 28(x0)
/* load base point G affine coordinates: G = (w8, w9) <= (x_g, y_g) */
jal x1, load_basepoint
/* store affine base point coordinates in dmem
dmem[dptr_x] = w8; dmem[dptr_y] = w9 */
bn.sid x12, 0(x21)
bn.sid x13, 0(x22)
/* load secret random number k from dmem: w0 = dmem[dptr_k] */
addi x0, x0, 0
bn.lid x8, 0(x16)
/* scalar multiplication with base point
(x_1, y_1) = (w11, w12) <= k*G = w0*(dmem[dptr_x], dmem[dptr_y]) */
jal x1, scalar_mult_int
/* setup modulus n */
jal x1, setup_barrett_n
/* load secret random number k from dmem: w0 <= k = dmem[dptr_k] */
bn.lid x8, 0(x16)
/* modular multiplicative inverse of k
w1 <= k^-1 mod n */
jal x1, mod_inv
/* w19 = k^-1*d mod n; w24 = d = dmem[dptr_d] */
bn.lid x10, 0(x23)
bn.mov w25, w1
jal x1, mod_mul_256x256
/* w24 = r <= w11 mod n */
bn.addm w24, w11, w31
/* store r of signature in dmem: dmem[dptr_r] <= r = w24 */
bn.sid x10, 0(x19)
addi x0, x0, 0
/* w0 = w19 <= w24*w25 = w24*w19 = r*k^-1*d mod n */
bn.mov w25, w19
jal x1, mod_mul_256x256
bn.mov w0, w19
/* load message from dmem: w24 = msg <= dmem[dptr_msg] = dmem[x18] */
bn.lid x10, 0(x18)
/* w19 = k^-1*msg <= w25*w24 = w1*w24 mod n */
bn.mov w25, w1
jal x1, mod_mul_256x256
/* w0 = s <= w19 + w0 = k^-1*msg + k^-1*r*d mod n */
bn.addm w0, w19, w0
/* store s of signature in dmem: dmem[dptr_s] <= s = w0 */
bn.sid x8, 0(x20)
jal x1, setup_barrett_p
ret
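/**
 * High-level sketch (Python, illustrative only; not part of the OTBN code)
 * of the signature equations implemented above, with scalar_mult and mod_inv
 * as sketched earlier and G the base point. The blinding and the
 * k^-1*d / k^-1*msg split are register-level details of the OTBN code.
 *
 *   def ecdsa_sign(msg, d, k, n):
 *       x1, _ = scalar_mult(k, G, n)       # R = k*G
 *       r = x1 % n
 *       k_inv = mod_inv(k, n)              # k^-1 mod n
 *       s = k_inv * (msg + r * d) % n      # = k^-1*msg + (k^-1*d)*r mod n
 *       return r, s
 */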
/**
* P-256 scalar multiplication with base point G
*
* returns R = d*G = d*(x_g, y_g)
* with R and G being valid P-256 curve points in affine coordinates,
* furthermore G being the curve's base point,
* d being a 256 bit scalar
*
* Performs a scalar multiplication of a scalar with the base point G of curve
* P-256.
* This routine runs in constant time.
*
* @param[in] dmem[4]: dptr_rnd, pointer to location in dmem containing random
* number for blinding
* @param[in] dmem[20]: dptr_x, pointer to affine x-coordinate in dmem
* @param[in] dmem[24]: dptr_y, pointer to affine y-coordinate in dmem
* @param[in] dmem[28]: dptr_d, pointer to location in dmem containing
* scalar d
*
* Flags: When leaving this subroutine, flags of FG0 depend on an
* intermediate result and are not usable after return.
* FG1 is not modified in this subroutine.
*
* clobbered registers: x3, x8 to x23, w0 to w25
* clobbered flag groups: FG0
*/
p256_scalar_base_mult:
addi x0, x0, 0
addi x3, x0, 0
bn.lid x3, 0(x0)
/* setup pointers into dmem */
lw x16, 0(x0)
lw x17, 4(x0)
lw x18, 8(x0)
lw x19, 12(x0)
lw x20, 16(x0)
lw x21, 20(x0)
lw x22, 24(x0)
lw x23, 28(x0)
/* setup pointers into reg file */
addi x8, x0, 0
addi x9, x0, 1
addi x10, x0, 24
addi x11, x0, 11
addi x12, x0, 8
addi x13, x0, 9
/* setup more dmem pointers */
lw x14, 24(x0)
lw x15, 28(x0)
bn.lid x8, 0(x17)
/* load base point G affine coordinates G = (w8, w9) <= (x_g, y_g) */
jal x1, load_basepoint
/* store affine base point coordinates in dmem
dmem[dptr_x] = w8; dmem[dptr_y] = w9 */
bn.sid x12, 0(x21)
bn.sid x13, 0(x22)
/* load scalar d from dmem w0 = dmem[dptr_d] */
addi x0, x0, 0
bn.lid x8, 0(x23)
/* scalar multiplication with base point
R = (x_r, y_r) = (w11, w12) <= d*G = w0*(dmem[dptr_x], dmem[dptr_y]) */
jal x1, scalar_mult_int
/* store result (affine coordinates) in dmem
dmem[x21] = dmem[dptr_x] <= x_r = w11
dmem[x22] = dmem[dptr_y] <= y_r = w12 */
bn.sid x11++, 0(x21)
bn.sid x11++, 0(x22)
ret
/**
* Variable time modular multiplicative inverse computation
*
* Returns c <= a^(-1) mod m
* with a being a bigint of length 256 bit with a < m
* m being the modulus with a length of 256 bit
* c being a 256-bit result
*
* This routine implements the computation of the modular multiplicative
* inverse based on the binary GCD or Stein's algorithm.
* The implemented variant is based on the
* "right-shift binary extended GCD" as it is described in section 3.1 of [1]
* (Algorithm 1).
* [1] https://doi.org/10.1155/ES/2006/32192
*
* Note that this is a variable time implementation. I.e. this routine will
* show a data dependent timing and execution profile. Only use in situations
* where a full white-box environment is acceptable.
*
* Flags: When leaving this subroutine, flags of FG0 depend on an
* intermediate result and are not usable after return.
* FG1 is not modified in this subroutine.
*
* @param[in] w0: a, operand
* @param[in] MOD: m, modulus
* @param[in] w30: constant 1
* @param[in] w31: all-zero
* @param[out] w1: result c
*
* clobbered registers: x2, w1 to w7
* clobbered flag groups: FG0
*/
mod_inv_var:
/* w2 = r = 0 */
bn.mov w2, w31
/* w3 = s = 1 */
bn.mov w3, w30
/* w4 = u = MOD */
bn.wsrr w4, 0
bn.wsrr w7, 0
/* w5 = v = w0 */
bn.mov w5, w0
ebgcd_loop:
/* test if u is odd */
bn.or w4, w4, w4
csrrs x2, 1984, x0
andi x2, x2, 4
bne x2, x0, ebgcd_u_odd
/* u is even: */
/* w4 = u <= u/2 = w4 >> 1 */
bn.rshi w4, w31, w4 >> 1
/* test if r is odd */
bn.or w2, w2, w2
csrrs x2, 1984, x0
andi x2, x2, 4
bne x2, x0, ebgcd_r_odd
/* r is even: */
/* w2 = r <= r/2 = w2 >> 1 */
bn.rshi w2, w31, w2 >> 1
jal x0, ebgcd_loop
ebgcd_r_odd:
/* w2 = r <= (r + m)/2 = (w2 + w7) >> 1 */
bn.add w2, w7, w2
bn.addc w6, w31, w31
bn.rshi w2, w6, w2 >> 1
jal x0, ebgcd_loop
ebgcd_u_odd:
/* test if v is odd */
bn.or w5, w5, w5
csrrs x2, 1984, x0
andi x2, x2, 4
bne x2, x0, ebgcd_uv_odd
/* v is even: */
/* w5 = v <= v/2 = w5 >> 1 */
bn.rshi w5, w31, w5 >> 1
/* test if s is odd */
bn.or w3, w3, w3
csrrs x2, 1984, x0
andi x2, x2, 4
bne x2, x0, ebgcd_s_odd
/* s is even: */
/* w3 = s <= s/2 = w3 >> 1 */
bn.rshi w3, w31, w3 >> 1
jal x0, ebgcd_loop
ebgcd_s_odd:
/* w3 = s <= (s + m)/2 = (w3 + w7) >> 1 */
bn.add w3, w7, w3
bn.addc w6, w31, w31
bn.rshi w3, w6, w3 >> 1
jal x0, ebgcd_loop
ebgcd_uv_odd:
/* test if v >= u */
bn.cmp w5, w4
csrrs x2, 1984, x0
andi x2, x2, 1
beq x2, x0, ebgcd_v_gte_u
/* u > v: */
/* w2 = r <= r - s = w2 - w3; if (r < 0): r <= r + m */
bn.subm w2, w2, w3
/* w4 = u <= u - v = w4 - w5 */
bn.sub w4, w4, w5
jal x0, ebgcd_loop
ebgcd_v_gte_u:
/* w3 = s <= s - r = w3 - w2; if (s < 0) s <= s + m */
bn.subm w3, w3, w2
/* w5 = v <= v - u = w5 - w4 */
bn.sub w5, w5, w4
/* if v > 0 go back to start of loop */
csrrs x2, 1984, x0
andi x2, x2, 8
beq x2, x0, ebgcd_loop
/* v <= 0: */
/* if (r > m): w1 = a = r - m = w2 - MOD else: w1 = a = r = w2 */
bn.addm w1, w2, w31
ret
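/**
 * Reference model (Python, illustrative only; not part of the OTBN code) of
 * the right-shift binary extended GCD above; the branch structure mirrors
 * the ebgcd_* labels.
 *
 *   def mod_inv_var(a, m):
 *       u, v, r, s = m, a, 0, 1
 *       while v > 0:
 *           if u % 2 == 0:                           # u even
 *               u //= 2
 *               r = r // 2 if r % 2 == 0 else (r + m) // 2
 *           elif v % 2 == 0:                         # ebgcd_u_odd, v even
 *               v //= 2
 *               s = s // 2 if s % 2 == 0 else (s + m) // 2
 *           elif u > v:                              # ebgcd_uv_odd
 *               r = (r - s) % m
 *               u -= v
 *           else:                                    # ebgcd_v_gte_u
 *               s = (s - r) % m
 *               v -= u
 *       return r % m                                 # r = a^-1 mod m
 */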
/**
* P-256 ECDSA signature verification
*
* returns the affine x-coordinate of
* (x1, y1) = u1*G + u2*Q
* with u1 = z*s^-1 mod n and u2 = r*s^-1 mod n
* with G being the curve's base point,
* z being the message
* r, s being the signature
* Q being the public key.
*
* The routine computes the x1 coordinate and places it in dmem. x1 will be
* reduced (mod n), however, the final comparison has to be performed on the
* host side. The signature is valid if x1 == r.
* This routine runs in variable time.
*
* @param[in] dmem[4]: dptr_x1, pointer to dmem location where the reduced
* affine x1-coordinate will be stored
* @param[in] dmem[8]: dptr_msg, pointer to the message to be verified in dmem
* @param[in] dmem[12]: dptr_r, pointer to r of signature in dmem
* @param[in] dmem[16]: dptr_s, pointer to s of signature in dmem
* @param[in] dmem[20]: dptr_x, pointer to x-coordinate of public key in dmem
* @param[in] dmem[24]: dptr_y, pointer to y-coordinate of public key in dmem
*
* Flags: When leaving this subroutine, flags of FG0 depend on an
* intermediate result and are not usable after return.
* FG1 is not modified in this subroutine.
*
* clobbered registers: x2, x3, x8 to x23, w0 to w25
* clobbered flag groups: FG0
*/
p256_verify:
addi x3, x0, 6
bn.lid x3, 0(x0)
lw x16, 0(x0)
/* load dmem pointer to x1 (result) from dmem: x17 <= dptr_x1 = dmem[4] */
lw x17, 4(x0)
/* load dmem pointer to message from dmem: x18 <= dptr_msg = dmem[8] */
lw x18, 8(x0)
/* load dmem pointer to signature r from dmem: x19 <= dptr_r = dmem[12] */
lw x19, 12(x0)
/* load dmem pointer to signature s from dmem, x20 <= dptr_s = dmem[16] */
lw x20, 16(x0)
/* load dmem pointer to affine x-coordinate of public key from dmem:
x21 <= dptr_x = dmem[20] */
lw x21, 20(x0)
/* load dmem pointer to affine y-coordinate of public key from dmem:
x22 <= dptr_y = dmem[24] */
lw x22, 24(x0)
lw x23, 28(x0)
/* setup pointers to WREGs */
addi x8, x0, 11
/* load dmem pointer to x1 (result) from dmem: x9 <= dptr_x1 = dmem[4] */
lw x9, 4(x0)
/* setup pointers to WREGs */
addi x10, x0, 24
addi x11, x0, 24
addi x12, x0, 0
addi x13, x0, 8
addi x14, x0, 9
addi x15, x0, 12
/* load r of signature from dmem: w24 = r = dmem[dptr_r] */
bn.lid x11, 0(x19)
/* keep a copy of r for the range checks below: w6 = r = w24 */
bn.mov w6, w24
/* invalidate the result register, such that the value stored on the
'fail' path (w24 = ~r) can never be equal to r */
bn.not w24, w24
/* setup modulus n (curve order) and Barrett constant */
jal x1, setup_barrett_n
/* goto 'fail' if w6 == w31 <=> r == 0 */
bn.cmp w6, w31
csrrs x2, 1984, x0
andi x2, x2, 8
bne x2, x0, fail
/* goto 'fail' if w6 >= w29 <=> r >= n */
bn.cmp w6, w29
csrrs x2, 1984, x0
andi x2, x2, 1
beq x2, x0, fail
/* load s of signature from dmem: w0 = s = dmem[dptr_s] */
bn.lid x12, 0(x20)
/* goto 'fail' if w0 == w31 <=> s == 0 */
bn.cmp w0, w31
csrrs x2, 1984, x0
andi x2, x2, 8
bne x2, x0, fail
/* goto 'fail' if w0 >= w29 <=> s >= n */
bn.cmp w0, w29
csrrs x2, 1984, x0
andi x2, x2, 1
beq x2, x0, fail
/* w1 = s^-1 mod n */
jal x1, mod_inv_var
/* load r of signature from dmem: w24 = r = dmem[dptr_r] */
bn.lid x11, 0(x19)
/* w25 = s^-1 = w1 */
bn.mov w25, w1
/* u2 = w0 = w19 <= w24*w25 = r*s^-1 mod n */
jal x1, mod_mul_256x256
bn.mov w0, w19
/* load message, w24 = msg = dmem[dptr_msg] */
bn.lid x10, 0(x18)
/* u1 = w1 = w19 <= w24*w25 = w24*w1 = msg*s^-1 mod n */
bn.mov w25, w1
jal x1, mod_mul_256x256
bn.mov w1, w19
/* setup modulus p and Barrett constant */
jal x1, setup_barrett_p
/* load public key Q from dmem and use in projective form (set z to 1)
Q = (w11, w12, w13) = (dmem[dptr_x], dmem[dptr_y], 1) */
bn.lid x8, 0(x21)
bn.lid x15, 0(x22)
bn.mov w13, w30
/* load base point G and use in projective form (set z to 1)
G = (w8, w9, w10) = (x_g, y_g, 1) */
jal x1, load_basepoint
bn.mov w10, w30
/* The rest of the routine implements a variable time double-and-add
algorithm. For the signature verification we need to compute the point
C = (x1, y1) = u_1*G + u_2*Q. This can be done in a single
double-and-add routine by using Shamir's Trick. */
/* G+Q = (w3, w4, w5) = (w11, w12, w13) = (w8, w9, w10) + (w11, w12, w13) */
jal x1, proj_add
bn.mov w3, w11
bn.mov w4, w12
bn.mov w5, w13
/* w2 = u_2 & u_1 = w0 & w1 */
bn.and w2, w0, w1
/* init double and add algorithm with (0, 1, 0) */
bn.mov w11, w31
bn.mov w12, w30
bn.mov w13, w31
/* main loop with decreasing index i (i=255 downto 0) */
loopi 256, 30
/* always double: C <= 2*C */
bn.mov w8, w11
bn.mov w9, w12
bn.mov w10, w13
jal x1, proj_add
/* if either u_1[i] == 0 or u_2[i] == 0 jump to 'no_both' */
bn.add w2, w2, w2
csrrs x2, 1984, x0
andi x2, x2, 1
beq x2, x0, no_both
/* both bits at current index (u1[i] and u2[i]) are set:
do C <= C + (G + Q) and jump to end */
bn.mov w8, w3
bn.mov w9, w4
bn.mov w10, w5
jal x1, proj_add
jal x0, no_q
/* not both bits are set; check u2[i] and u1[i] individually */
no_both:
/* if u2[i] is not set jump to 'no_g' */
bn.add w6, w0, w0
csrrs x2, 1984, x0
andi x2, x2, 1
beq x2, x0, no_g
/* u2[i] is set: do C <= C + Q */
bn.lid x13, 0(x21)
bn.lid x14, 0(x22)
bn.mov w10, w30
jal x1, proj_add
no_g:
/* if u1[i] is not set jump to 'no_q' */
bn.add w6, w1, w1
csrrs x2, 1984, x0
andi x2, x2, 1
beq x2, x0, no_q
/* u1[i] is set: do C <= C + G */
jal x1, load_basepoint
bn.mov w10, w30
jal x1, proj_add
no_q:
/* left shift w0 and w1 to decrease index */
bn.add w0, w0, w0
bn.add w1, w1, w1
/* compute inverse of z-coordinate: w1 = z_c^-1 mod p */
bn.mov w0, w13
jal x1, mod_inv_var
/* convert x-coordinate of C back to affine: x1 = x_c * z_c^-1 mod p */
bn.mov w24, w1
bn.mov w25, w11
jal x1, mod_mul_256x256
/* final reduction: w24 = x1 <= x1 mod n */
jal x1, setup_barrett_n
bn.subm w24, w19, w31
fail:
/* store affine x-coordinate in dmem: dmem[dptr_x1] = w24 = x1 */
bn.sid x11, 0(x17)
ret
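/**
 * High-level sketch (Python, illustrative only; not part of the OTBN code)
 * of the verification equation above. The Shamir's trick loop is modeled by
 * two separate multiplications plus one addition, which is mathematically
 * equivalent; point_mul and point_add are assumed affine group operations
 * (hypothetical helpers, see the proj_add and scalar_mult models above).
 *
 *   def ecdsa_verify(msg, r, s, Q, n):
 *       if not (0 < r < n and 0 < s < n):
 *           return False                   # the four range checks above
 *       s_inv = mod_inv_var(s, n)
 *       u1 = msg * s_inv % n               # u1 = msg * s^-1 mod n
 *       u2 = r * s_inv % n                 # u2 = r * s^-1 mod n
 *       x1, y1 = point_add(point_mul(u1, G), point_mul(u2, Q))
 *       return x1 % n == r                 # done by the host for the OTBN code
 */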
/**
* Externally callable wrapper for P-256 scalar point multiplication
*
* returns R = k*P = k*(x_p, y_p)
* with R, P being valid P-256 curve points in affine coordinates,
* k being a 256 bit scalar.
*
* Sets up context and calls internal scalar multiplication routine.
* This routine runs in constant time.
*
* @param[in] dmem[0]: dptr_k, pointer to location in dmem containing scalar k
* @param[in] dmem[4]: dptr_rnd, pointer to location in dmem containing random
* number for blinding
* @param[in] dmem[20]: dptr_x, pointer to affine x-coordinate in dmem
* @param[in] dmem[24]: dptr_y, pointer to affine y-coordinate in dmem
*
* Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
* the computed affine y-coordinate.
*
* clobbered registers: x3, x8 to x23, w0 to w25
* clobbered flag groups: FG0
*/
p256_scalarmult:
/* w0 = dmem[0] */
addi x3, x0, 0
bn.lid x3, 0(x0)
/* setup pointers to dmem */
/* x16 = dptr_k */
lw x16, 0(x0)
/* x17 = dptr_rnd */
lw x17, 4(x0)
lw x18, 8(x0)
lw x19, 12(x0)
lw x20, 16(x0)
/* x21 = dptr_x */
lw x21, 20(x0)
/* x22 = dptr_y */
lw x22, 24(x0)
lw x23, 28(x0)
/* setup pointers to WREGs */
addi x8, x0, 0
addi x9, x0, 1
addi x10, x0, 24
addi x11, x0, 11
/* setup pointers to dmem */
lw x12, 16(x0)
lw x13, 20(x0)
lw x14, 24(x0)
lw x15, 28(x0)
/* load scalar k
w0 = k = dmem[x16] = dmem[0] */
bn.lid x8, 0(x16)
/* call internal scalar multiplication routine
R = (x_a, y_a) = (w11, w12) <= k*P = w0*P */
jal x1, scalar_mult_int
/* store result (affine coordinates) in dmem
dmem[x21] = dmem[dptr_x] <= x_a = w11
dmem[x22] = dmem[dptr_y] <= y_a = w12 */
bn.sid x11++, 0(x21)
bn.sid x11++, 0(x22)
ret