[otbn] split P-384 lib

This splits the P-384 lib into two assembly files:
- a base lib containing the domain parameters and
  routines for point addition
- a sign lib containing routines for signature generation
  and constant-time scalar point multiplication building up
  on the base lib.

This is in preparation having independent P-384 binaries,
one for signature generation and one for signature verification.

Signed-off-by: Felix Miller <felix.miller@gi-de.com>
diff --git a/sw/otbn/code-snippets/p384.s b/sw/otbn/code-snippets/p384.s
deleted file mode 100644
index 06a66c4..0000000
--- a/sw/otbn/code-snippets/p384.s
+++ /dev/null
@@ -1,1827 +0,0 @@
-/* Copyright lowRISC contributors. */
-/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
-/* SPDX-License-Identifier: Apache-2.0 */
-/*
- *   P-384 specific routines
- */
-
- .section .text
-
-
-/**
- * 384-bit modular multiplication based on Barrett reduction algorithm
- * optimized for the special modulus of the NIST P-384 curve.
- *
- * Returns c = a x b % p.
- *
- * Expects: two operands, modulus p and pre-calculated parameter u for barrett
- * reduction (usually greek mu in literature). u is expected without the
- * leading 1 at bit 384. u has to be pre-calculated as u = floor(2^768/p).
- *
- * This implementation mostly follows the description in the
- * "Handbook of Applied Cryptography" in Algorithm 14.42.
- * Differences:
- *   - This implementation incorporates a multiplication before the reduction.
- *     Therefore it expects two operands (a, b) instead of a wider integer x.
- *   - The computation of q2 ignores the MSbs of q1 and u to allow using
- *     a 384x384 bit multiplication. This is compensated later by
- *     individual (conditional) additions.
- *   - The truncations in step 2 of HAC 14.42 in the form of (... mod b^(k+1) )
- *     are not implemented here and the full register width is used. This
- *     allows to omit computation of r1 (since r1=x) and step 3 of HAC 14.42
- *
- * Flags: Flags have no meaning beyond the scope of this subroutine.
- *
- * @param[in] [w11, w10]: a, first operand, max. length 384 bit, a < m.
- * @param[in] [w17, w16]: b, second operand, max. length 384 bit, b < m.
- * @param[in] [w13, w12]: p, modulus of P384 i.e.:
-                             m = 2^384 - 2^128 - 2^96 + 2^32 - 1.
- * @param[in] [w15, w14]: u, pre-computed Barrett constant (without u[384]/MSb
- *                           of u which is always 1 for the allowed range but
- *                           has to be set to 0 here).
- * @param[in] w31: all-zero.
- * @param[out] [w17, w16]: c, result, max. length 384 bit.
- *
- * Clobbered registers: w10, w11, w16, w17, w18, w19, w20, w21, w22, w23, w24
- * Clobbered flag groups: FG0
- */
- .globl barrett384_p384
-barrett384_p384:
-  /* Compute the integer product of the operands x = a * b
-     x = [w18, w22, w21] = a * b = [w11, w10] * [w17, w16]
-     => max. length x: 768 bit */
-  bn.mulqacc.z          w10.0, w16.0,   0
-  bn.mulqacc            w10.0, w16.1,  64
-  bn.mulqacc.so w21.L,  w10.1, w16.0,  64
-  bn.mulqacc            w10.0, w16.2,   0
-  bn.mulqacc            w10.1, w16.1,   0
-  bn.mulqacc            w10.2, w16.0,   0
-  bn.mulqacc            w10.0, w16.3,  64
-  bn.mulqacc            w10.1, w16.2,  64
-  bn.mulqacc            w10.2, w16.1,  64
-  bn.mulqacc.so w21.U,  w10.3, w16.0,  64
-  bn.mulqacc            w10.0, w17.0,   0
-  bn.mulqacc            w10.1, w16.3,   0
-  bn.mulqacc            w10.2, w16.2,   0
-  bn.mulqacc            w10.3, w16.1,   0
-  bn.mulqacc            w11.0, w16.0,   0
-  bn.mulqacc            w10.0, w17.1,  64
-  bn.mulqacc            w10.1, w17.0,  64
-  bn.mulqacc            w10.2, w16.3,  64
-  bn.mulqacc            w10.3, w16.2,  64
-  bn.mulqacc            w11.0, w16.1,  64
-  bn.mulqacc.so w22.L,  w11.1, w16.0,  64
-  bn.mulqacc            w10.1, w17.1,   0
-  bn.mulqacc            w10.2, w17.0,   0
-  bn.mulqacc            w10.3, w16.3,   0
-  bn.mulqacc            w11.0, w16.2,   0
-  bn.mulqacc            w11.1, w16.1,   0
-  bn.mulqacc            w10.2, w17.1,  64
-  bn.mulqacc            w10.3, w17.0,  64
-  bn.mulqacc            w11.0, w16.3,  64
-  bn.mulqacc.so w22.U,  w11.1, w16.2,  64
-  bn.mulqacc            w10.3, w17.1,   0
-  bn.mulqacc            w11.0, w17.0,   0
-  bn.mulqacc            w11.1, w16.3,   0
-  bn.mulqacc            w11.0, w17.1,  64
-  bn.mulqacc.so w18.L,  w11.1, w17.0,  64
-  bn.mulqacc.so w18.U,  w11.1, w17.1,   0
-
-  /* Store correction factor to compensate for later neglected MSb of x.
-     x is 768 bit wide and therefore the 383 bit right shifted version q1
-     (below) contains 385 bit. Bit 384 of q1 is neglected to allow using a
-     384x384 multiplier. For the MSb of x being set we temporary store u
-     (or zero) here to be used in a later constant time correction of a
-     multiplication with u. Note that this requires the MSb flag being carried
-     over from the multiplication routine. */
-  bn.sel w23, w14, w31, M
-  bn.sel w24, w15, w31, M
-
-  /* Compute q1 = x >> 383
-     q1 = [w11, w10] = [w18, w22, w21] >> 383  = [w18, w21] >> 127
-     => max length q1: 385 bits */
-  bn.rshi w11, w31, w18 >> 127
-  bn.rshi w10, w18, w22 >> 127
-
-  /* Compute q2 = q1*u
-     Instead of full q2 (which would be up to 770 bits) we ignore the MSb of u
-     and the MSb of q1 and correct this later. This allows using a 384x384
-     multiplier. We use the property that u for the modulus of P384 is zero in
-     the bits 383 downto 129 and use a 384x192 multiplication routine.
-     => max. length q2': 513 bit
-     q2' = q1[383:0]*u[128:0] = [w18, w17, w16] = [w11, w10] * [w15, w14] */
-
-  /* 576 = 384*192 bit multiplication kernel */
-  bn.mulqacc.z          w10.0, w14.0,   0
-  bn.mulqacc            w10.0, w14.1,  64
-  bn.mulqacc.so w16.L,  w10.1, w14.0,  64
-  bn.mulqacc            w10.0, w14.2,   0
-  bn.mulqacc            w10.1, w14.1,   0
-  bn.mulqacc            w10.2, w14.0,   0
-  bn.mulqacc            w10.1, w14.2,  64
-  bn.mulqacc            w10.2, w14.1,  64
-  bn.mulqacc.so w16.U,  w10.3, w14.0,  64
-  bn.mulqacc            w10.2, w14.2,   0
-  bn.mulqacc            w10.3, w14.1,   0
-  bn.mulqacc            w11.0, w14.0,   0
-  bn.mulqacc            w10.3, w14.2,  64
-  bn.mulqacc            w11.0, w14.1,  64
-  bn.mulqacc.so w17.L,  w11.1, w14.0,  64
-  bn.mulqacc            w11.0, w14.2,   0
-  bn.mulqacc            w11.1, w14.1,   0
-  bn.mulqacc.so w17.U,  w11.1, w14.2,  64
-
-  /* w14.3 is always zero here due to structure of Barrett constant */
-  bn.mulqacc.wo w18,    w11.1, w14.3,  64
-
-  /* q3 = q2 >> 385
-     In this step, the compensation for the neglected MSbs of q1 and u is
-     carried out underway. To add them in the q2 domain, they would have to be
-     left shifted by 384 bit first. To directly add them we first shift q2' by
-     384 bit to the right, perform the additions, and shift the result another
-     bit to the right. The additions cannot overflow due to leading zeros
-     after shift.
-     q2'' = q2' >> 384 = [w20, w19] = [w18, w17, w16] >> 384
-                                    = [w18, w17] >> 128 */
-  bn.rshi w20, w31, w18 >> 128
-  bn.rshi w19, w18, w17 >> 128
-  /* Add q1. This is unconditional since MSb of u is always 1.
-     This cannot overflow due to leading zeros.
-     q2''' = q2'' + q1 = [w20, w19] = [w20, w19] + [w10, w11] */
-  bn.add w19, w19, w10
-  bn.addc w20, w20, w11
-  /* Conditionally add u (without leading 1) in case of MSb of x being set.
-     This is the "real" q2 but shifted by 384 bits to the right. This cannot
-     overflow due to leading zeros
-     q2'''' = x[767]?q2'''+u[383:0]:q2'''
-            = [w20, w19] + [w24, w23] = q2 >> 384 */
-  bn.add w19, w19, w23
-  bn.addc w20, w20, w24
-  /* finally this gives q3 by shifting the remaining bit to the right
-     q3 = q2 >> 385 = q2'''' >> 1 = [w11, w10] = [w20, w19] >> 1 */
-  bn.rshi w11, w31, w20 >> 1
-  bn.rshi w10, w20, w19 >> 1
-
-  /* r2 = q3*m[511:0] = [w17, w16] = ([w11, w10] * [w13, w12])[511:0]
-     A 384x384 bit multiplication kernel is used here, hence both q3 or p
-     must not be wider than 384 bit. This is always the case for p. For q3 it
-     is the case if a<p and b<p.
-     The 256 highest bits of the multiplication result are not needed,
-     so we do not compute them. */
-  bn.mulqacc.z          w10.0, w12.0,   0
-  bn.mulqacc            w10.0, w12.1,  64
-  bn.mulqacc.so w16.L,  w10.1, w12.0,  64
-  bn.mulqacc            w10.0, w12.2,   0
-  bn.mulqacc            w10.1, w12.1,   0
-  bn.mulqacc            w10.2, w12.0,   0
-  bn.mulqacc            w10.0, w12.3,  64
-  bn.mulqacc            w10.1, w12.2,  64
-  bn.mulqacc            w10.2, w12.1,  64
-  bn.mulqacc.so w16.U,  w10.3, w12.0,  64
-  bn.mulqacc            w10.0, w13.0,   0
-  bn.mulqacc            w10.1, w12.3,   0
-  bn.mulqacc            w10.2, w12.2,   0
-  bn.mulqacc            w10.3, w12.1,   0
-  bn.mulqacc            w11.0, w12.0,   0
-  bn.mulqacc            w10.0, w13.1,  64
-  bn.mulqacc            w10.1, w13.0,  64
-  bn.mulqacc            w10.2, w12.3,  64
-  bn.mulqacc            w10.3, w12.2,  64
-  bn.mulqacc            w11.0, w12.1,  64
-  bn.mulqacc.so w17.L,  w11.1, w12.0,  64
-  bn.mulqacc            w10.1, w13.1,   0
-  bn.mulqacc            w10.2, w13.0,   0
-  bn.mulqacc            w10.3, w12.3,   0
-  bn.mulqacc            w11.0, w12.2,   0
-  bn.mulqacc            w11.1, w12.1,   0
-  bn.mulqacc            w10.2, w13.1,  64
-  bn.mulqacc            w10.3, w13.0,  64
-  bn.mulqacc            w11.0, w12.3,  64
-  bn.mulqacc.so w17.U,  w11.1, w12.2,  64
-
-  /* Compute r = x-r2 = x-q3*p
-     since 0 <= r < 3*p, we only need to consider the lower limbs of x and r2
-     r[511:0] = [w22, w21] - [w17, w16] */
-  bn.sub w21, w21, w16
-  bn.subb w22, w22, w17
-
-  /* Barrett algorithm requires subtraction of the modulus at most two times if
-     result is too large. However in the special case of P-384 we need to
-     subtract only once */
-  bn.sub w16, w21, w12
-  bn.subb w17, w22, w13
-  bn.sel w16, w21, w16, C
-  bn.sel w17, w22, w17, C
-
-  /* return result: c =[w17, w16] =  a * b % p. */
-  ret
-
-
-/**
- * P-384 point addition in projective space
- *
- * returns R = (x_r, y_r, z_r) <= P+Q = (x_p, y_p, z_p) + (x_q, y_q, z_q)
- *         with R, P and Q being valid P-384 curve points
- *           in projective coordinates
- *
- * This routine adds two valid P-384 curve points in projective space.
- * Point addition is performed based on the complete formulas of Bosma and
- * Lenstra for Weierstrass curves as first published in [1] and
- * optimized in [2].
- * The implemented version follows Algorithm 4 of [2] which is an optimized
- * variant for Weierstrass curves with domain parameter 'a' set to a=-3.
- * Numbering of the steps below and naming of symbols follows the
- * terminology of Algorithm 4 of [2].
- * The routine is limited to P-384 curve points due to:
- *   - fixed a=-3 domain parameter
- *   - usage of a P-384 optimized Barrett multiplication kernel
- * This routine runs in constant time.
- *
- * [1] https://doi.org/10.1006/jnth.1995.1088
- * [2] https://doi.org/10.1007/978-3-662-49890-3_16
- *
- * @param[in]  x22: set to 10, pointer to in reg for Barrett routine
- * @param[in]  x23: set to 11, pointer to in reg for Barrett routine
- * @param[in]  x24: set to 16, pointer to in/out reg for Barrett routine
- * @param[in]  x25: set to 17, pointer to in/out reg for Barrett routine
- * @param[in]  x26: dptr_p_p, dmem pointer to point P in dmem (projective)
- * @param[in]  x27: dptr_q_p, dmem pointer to point Q in dmem (projective)
- * @param[in]  x28: dptr_b, dmem pointer to domain parameter b of P-384 in dmem
- * @param[in]  [w13, w12]: p, modulus of underlying field of P-384
- * @param[in]  [w15, w14]: u[383:0] lower 384 bit of Barrett constant u for
- *                                    modulus p
- * @param[in]  w31: all-zero.
- * @param[out]  [w26, w25]: x_r, x-coordinate of resulting point R
- * @param[out]  [w28, w27]: y_r, y-coordinate of resulting point R
- * @param[out]  [w30, w29]: z_r, z-coordinate of resulting point R
- *
- * Flags: Flags have no meaning beyond the scope of this subroutine.
- *
- * clobbered registers: w0 to w30
- * clobbered flag groups: FG0
- */
-.globl proj_add_p384
-proj_add_p384:
-  /* mapping of parameters to symbols of [2] (Algorithm 4):
-     X1 = x_p; Y1 = y_p; Z1 = z_p; X2 = x_q; Y2 = y_q; Z2 = z_q
-     X3 = x_r; Y3 = y_r; Z3 = z_r */
-
-  /* 1: [w1, w0] = t0 <= X1*X2 = dmem[x26+0]*dmem[x27+0] */
-  bn.lid    x22, 0(x26)
-  bn.lid    x23, 32(x26)
-  bn.lid    x24, 0(x27)
-  bn.lid    x25, 32(x27)
-  jal       x1, barrett384_p384
-  bn.mov    w0, w16
-  bn.mov    w1, w17
-
-  /* 2: [w3, w2] = t1 <= Y1*Y2 = dmem[x26+64]*dmem[x27+64] */
-  bn.lid    x22, 64(x26)
-  bn.lid    x23, 96(x26)
-  bn.lid    x24, 64(x27)
-  bn.lid    x25, 96(x27)
-  jal       x1, barrett384_p384
-  bn.mov    w2, w16
-  bn.mov    w3, w17
-
-  /* 3: [w5, w4] = t2 <= Z1*Z2 = dmem[x26+128]*dmem[x27+128] */
-  bn.lid    x22, 128(x26)
-  bn.lid    x23, 160(x26)
-  bn.lid    x24, 128(x27)
-  bn.lid    x25, 160(x27)
-  jal       x1, barrett384_p384
-  bn.mov    w4, w16
-  bn.mov    w5, w17
-
-  /* 4: [w7, w6] = t3 <= X1+Y1 = dmem[x26+0]+dmem[x26+64] */
-  bn.lid    x22, 0(x26)
-  bn.lid    x23, 32(x26)
-  bn.lid    x24, 64(x26)
-  bn.lid    x25, 96(x26)
-  bn.add    w16, w10, w16
-  bn.addc   w17, w11, w17
-  bn.sub    w10, w16, w12
-  bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w6, w16
-  bn.mov    w7, w17
-
-  /* 5: [w9, w8] = t4 <= X2+Y2 = dmem[x27+0]+dmem[x27+64] */
-  bn.lid    x22, 0(x27)
-  bn.lid    x23, 32(x27)
-  bn.lid    x24, 64(x27)
-  bn.lid    x25, 96(x27)
-  bn.add    w16, w10, w16
-  bn.addc   w17, w11, w17
-  bn.sub    w10, w16, w12
-  bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w8, w16
-  bn.mov    w9, w17
-
-  /* 6: [w7, w6] = t3 <= t3*t4 = [w7, w6]*[w9, w8] */
-  bn.mov    w10, w6
-  bn.mov    w11, w7
-  bn.mov    w16, w8
-  bn.mov    w17, w9
-  jal       x1, barrett384_p384
-  bn.mov    w6, w16
-  bn.mov    w7, w17
-
-  /* 7: [w9, w8] = t4 <= t0+t1 = [w1, w0]+[w3, w2] */
-  bn.add    w16, w0, w2
-  bn.addc   w17, w1, w3
-  bn.sub    w10, w16, w12
-  bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w8, w16
-  bn.mov    w9, w17
-
-  /* 8: [w7, w6] = t3 <= t3-t4 = [w7, w6]-[w9, w8] */
-  bn.sub    w16, w6, w8
-  bn.subb   w17, w7, w9
-  bn.add    w10, w16, w12
-  bn.addc   w11, w17, w13
-  bn.sel    w16, w10, w16, C
-  bn.sel    w17, w11, w17, C
-  bn.mov    w6, w16
-  bn.mov    w7, w17
-
-  /* 9: [w9, w8] = t4 <= Y1+Z1 = dmem[x26+64]+dmem[x26+128] */
-  bn.lid    x22, 64(x26)
-  bn.lid    x23, 96(x26)
-  bn.lid    x24, 128(x26)
-  bn.lid    x25, 160(x26)
-  bn.add    w16, w10, w16
-  bn.addc   w17, w11, w17
-  bn.sub    w10, w16, w12
-  bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w8, w16
-  bn.mov    w9, w17
-
-  /* 10: [w26, w25] = X3 <= Y2+Z2 = dmem[x27+64]+dmem[x27+128] */
-  bn.lid    x22, 64(x27)
-  bn.lid    x23, 96(x27)
-  bn.lid    x24, 128(x27)
-  bn.lid    x25, 160(x27)
-  bn.add    w16, w10, w16
-  bn.addc   w17, w11, w17
-  bn.sub    w10, w16, w12
-  bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w25, w16
-  bn.mov    w26, w17
-
-  /* 11: [w9, w8] = t4 <= t4*X3 = [w9, w8]*[w26, w25] */
-  bn.mov    w10, w8
-  bn.mov    w11, w9
-  bn.mov    w16, w25
-  bn.mov    w17, w26
-  jal       x1, barrett384_p384
-  bn.mov    w8, w16
-  bn.mov    w9, w17
-
-  /* 12: [w26, w25] = X3 <= t1+t2 = [w3, w2]+[w5, w4] */
-  bn.add    w16, w2, w4
-  bn.addc   w17, w3, w5
-  bn.sub    w10, w16, w12
-  bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w25, w16
-  bn.mov    w26, w17
-
-  /* 13: [w9, w8] = t4 <= t4-X3 = [w9, w8]-[w26, w25] */
-  bn.sub    w16, w8, w25
-  bn.subb   w17, w9, w26
-  bn.add    w10, w16, w12
-  bn.addc   w11, w17, w13
-  bn.sel    w16, w10, w16, C
-  bn.sel    w17, w11, w17, C
-  bn.mov    w8, w16
-  bn.mov    w9, w17
-
-  /* 14: [w26, w25] = X3 <= X1+Z1 = dmem[x26+0]+dmem[x26+128] */
-  bn.lid    x22, 0(x26)
-  bn.lid    x23, 32(x26)
-  bn.lid    x24, 128(x26)
-  bn.lid    x25, 160(x26)
-  bn.add    w16, w10, w16
-  bn.addc   w17, w11, w17
-  bn.sub    w10, w16, w12
-  bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w25, w16
-  bn.mov    w26, w17
-
-  /* 15: [w28, w27] = Y3 <= X2+Z2 = dmem[x27+0]+dmem[x27+128] */
-  bn.lid    x22, 0(x27)
-  bn.lid    x23, 32(x27)
-  bn.lid    x24, 128(x27)
-  bn.lid    x25, 160(x27)
-  bn.add    w16, w10, w16
-  bn.addc   w17, w11, w17
-  bn.sub    w10, w16, w12
-  bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w27, w16
-  bn.mov    w28, w17
-
-  /* 16: [w26, w25] = X3 <= X3*Y3 = [w26, w25]*[w28, w27] */
-  bn.mov    w10, w25
-  bn.mov    w11, w26
-  bn.mov    w16, w27
-  bn.mov    w17, w28
-  jal       x1, barrett384_p384
-  bn.mov    w25, w16
-  bn.mov    w26, w17
-
-  /* 17: [w28, w27] = Y3 <= t0+t2 = [w1, w0]+[w5, w4] */
-  bn.add    w16, w0, w4
-  bn.addc   w17, w1, w5
-  bn.sub    w10, w16, w12
-  bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w27, w16
-  bn.mov    w28, w17
-
-  /* 18: [w28, w27] = Y3 <= X3-Y3 = [w26, w25]-[w28, w27] */
-  bn.sub    w16, w25, w27
-  bn.subb   w17, w26, w28
-  bn.add    w10, w16, w12
-  bn.addc   w11, w17, w13
-  bn.sel    w16, w10, w16, C
-  bn.sel    w17, w11, w17, C
-  bn.mov    w27, w16
-  bn.mov    w28, w17
-
-  /* 19: [w30, w29] = Z3 <= b*t2 = dmem[x28+0]*[w5, w4] */
-  bn.lid    x22, 0(x28)
-  bn.lid    x23, 32(x28)
-  bn.mov    w16, w4
-  bn.mov    w17, w5
-  jal       x1, barrett384_p384
-  bn.mov    w29, w16
-  bn.mov    w30, w17
-
-  /* 20: [w26, w25] = X3 <= Y3-Z3 = [w28, w27]-[w30, w29] */
-  bn.sub    w16, w27, w29
-  bn.subb   w17, w28, w30
-  bn.add    w10, w16, w12
-  bn.addc   w11, w17, w13
-  bn.sel    w16, w10, w16, C
-  bn.sel    w17, w11, w17, C
-  bn.mov    w25, w16
-  bn.mov    w26, w17
-
-  /* 21: [w30, w29] = Z3 <= X3+X3 = [w26, w25]+[w26, w25] */
-  bn.add    w16, w25, w25
-  bn.addc   w17, w26, w26
-  bn.sub    w10, w16, w12
-  bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w29, w16
-  bn.mov    w30, w17
-
-  /* 22: [w26, w25] = X3 <= X3+Z3 = [w26, w25]+[w30, w29] */
-  bn.add    w16, w25, w29
-  bn.addc   w17, w26, w30
-  bn.sub    w10, w16, w12
-  bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w25, w16
-  bn.mov    w26, w17
-
-  /* 23: [w30, w29] = Z3 <= t1-X3 = [w3, w2]-[w26, w25] */
-  bn.sub    w16, w2, w25
-  bn.subb   w17, w3, w26
-  bn.add    w10, w16, w12
-  bn.addc   w11, w17, w13
-  bn.sel    w16, w10, w16, C
-  bn.sel    w17, w11, w17, C
-  bn.mov    w29, w16
-  bn.mov    w30, w17
-
-  /* 24: [w26, w25] = X3 <= t1+X3 = [w3, w2]+[w26, w25] */
-  bn.add    w16, w2, w25
-  bn.addc   w17, w3, w26
-  bn.sub    w10, w16, w12
-  bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w25, w16
-  bn.mov    w26, w17
-
-  /* 25: [w28, w27] = Y3 <= b*Y3 = dmem[x28+0]*[w28, w27] */
-  bn.lid    x22, 0(x28)
-  bn.lid    x23, 32(x28)
-  bn.mov    w16, w27
-  bn.mov    w17, w28
-  jal       x1, barrett384_p384
-  bn.mov    w27, w16
-  bn.mov    w28, w17
-
-  /* 26: [w3, w2] = t1 <= t2+t2 = [w5, w4]+[w5, w4] */
-  bn.add    w16, w4, w4
-  bn.addc   w17, w5, w5
-  bn.sub    w10, w16, w12
-  bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w2, w16
-  bn.mov    w3, w17
-
-  /* 27: [w5, w4] = t2 <= t1+t2 = [w3, w2]+[w5, w4] */
-  bn.add    w16, w2, w4
-  bn.addc   w17, w3, w5
-  bn.sub    w10, w16, w12
-  bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w4, w16
-  bn.mov    w5, w17
-
-  /* 28: [w28, w27] = Y3 <= Y3-t2 = [w28, w27]-[w5, w4] */
-  bn.sub    w16, w27, w4
-  bn.subb   w17, w28, w5
-  bn.add    w10, w16, w12
-  bn.addc   w11, w17, w13
-  bn.sel    w16, w10, w16, C
-  bn.sel    w17, w11, w17, C
-  bn.mov    w27, w16
-  bn.mov    w28, w17
-
-  /* 29: [w28, w27] = Y3 <= Y3-t0 = [w28, w27]-[w1, w0] */
-  bn.sub    w16, w27, w0
-  bn.subb   w17, w28, w1
-  bn.add    w10, w16, w12
-  bn.addc   w11, w17, w13
-  bn.sel    w16, w10, w16, C
-  bn.sel    w17, w11, w17, C
-  bn.mov    w27, w16
-  bn.mov    w28, w17
-
-  /* 30: [w3, w2] = t1 <= Y3+Y3 = [w28, w27]+[w28, w27] */
-  bn.add    w16, w27, w27
-  bn.addc   w17, w28, w28
-  bn.sub    w10, w16, w12
-  bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w2, w16
-  bn.mov    w3, w17
-
-  /* 31: [w28, w27] = Y3 <= t1+Y3 = [w3, w2]+[w28, w27] */
-  bn.add    w16, w2, w27
-  bn.addc   w17, w3, w28
-  bn.sub    w10, w16, w12
-  bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w27, w16
-  bn.mov    w28, w17
-
-  /* 32: [w3, w2] = t1 <= t0+t0 = [w1, w0]+[w1, w0] */
-  bn.add    w16, w0, w0
-  bn.addc   w17, w1, w1
-  bn.sub    w10, w16, w12
-  bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w2, w16
-  bn.mov    w3, w17
-
-  /* 33: [w1, w0] = t0 <= t1+t0 = [w3, w2]+[w1, w0] */
-  bn.add    w16, w2, w0
-  bn.addc   w17, w3, w1
-  bn.sub    w10, w16, w12
-  bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w0, w16
-  bn.mov    w1, w17
-
-  /* 34: [w1, w0] = t0 <= t0-t2 = [w1, w0]-[w5, w4] */
-  bn.sub    w16, w0, w4
-  bn.subb   w17, w1, w5
-  bn.add    w10, w16, w12
-  bn.addc   w11, w17, w13
-  bn.sel    w16, w10, w16, C
-  bn.sel    w17, w11, w17, C
-  bn.mov    w0, w16
-  bn.mov    w1, w17
-
-  /* 35: [w3, w2] = t1 <= t4*Y3 = [w9, w8]*[w28, w27] */
-  bn.mov    w10, w8
-  bn.mov    w11, w9
-  bn.mov    w16, w27
-  bn.mov    w17, w28
-  jal       x1, barrett384_p384
-  bn.mov    w2, w16
-  bn.mov    w3, w17
-
-  /* 36: [w5, w4] = t2 <= t0*Y3 = [w1, w0]*[w28, w27] */
-  bn.mov    w10, w0
-  bn.mov    w11, w1
-  bn.mov    w16, w27
-  bn.mov    w17, w28
-  jal       x1, barrett384_p384
-  bn.mov    w4, w16
-  bn.mov    w5, w17
-
-  /* 37: [w28, w27] = Y3 <= X3*Z3 = [w26, w25]*[w30, w29] */
-  bn.mov    w10, w25
-  bn.mov    w11, w26
-  bn.mov    w16, w29
-  bn.mov    w17, w30
-  jal       x1, barrett384_p384
-  bn.mov    w27, w16
-  bn.mov    w28, w17
-
-  /* 38: [w28, w27] = Y3 <= Y3+t2 = [w28, w27]+[w5, w4] */
-  bn.add    w16, w27, w4
-  bn.addc   w17, w28, w5
-  bn.sub    w10, w16, w12
-  bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w27, w16
-  bn.mov    w28, w17
-
-  /* 39: [w26, w25] = X3 <= t3*X3 = [w7, w6]*[w26, w25] */
-  bn.mov    w10, w6
-  bn.mov    w11, w7
-  bn.mov    w16, w25
-  bn.mov    w17, w26
-  jal       x1, barrett384_p384
-  bn.mov    w25, w16
-  bn.mov    w26, w17
-
-  /* 40: [w26, w25] = X3 <= X3-t1 = [w26, w25]-[w3, w2] */
-  bn.sub    w16, w25, w2
-  bn.subb   w17, w26, w3
-  bn.add    w10, w16, w12
-  bn.addc   w11, w17, w13
-  bn.sel    w16, w10, w16, C
-  bn.sel    w17, w11, w17, C
-  bn.mov    w25, w16
-  bn.mov    w26, w17
-
-  /* 41: [w30, w29] = Z3 <= t4*Z3 = [w9, w8]*[w30, w29] */
-  bn.mov    w10, w8
-  bn.mov    w11, w9
-  bn.mov    w16, w29
-  bn.mov    w17, w30
-  jal       x1, barrett384_p384
-  bn.mov    w29, w16
-  bn.mov    w30, w17
-
-  /* 42: [w3, w2] = t1 <= t3*t0 = [w7, w6]*[w1, w0] */
-  bn.mov    w10, w6
-  bn.mov    w11, w7
-  bn.mov    w16, w0
-  bn.mov    w17, w1
-  jal       x1, barrett384_p384
-  bn.mov    w2, w16
-  bn.mov    w3, w17
-
-  /* 43: [w30, w29] = Z3 <= Z3+t1 = [w30, w29]+[w3, w2] */
-  bn.add    w16, w29, w2
-  bn.addc   w17, w30, w3
-  bn.sub    w10, w16, w12
-  bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w29, w16
-  bn.mov    w30, w17
-
-  ret
-
-
-/**
- * Convert projective coordinates of a P-384 curve point to affine coordinates
- *
- * returns P = (x_a, y_a) = (x/z mod p, y/z mod p)
- *              where P is a valid P-384 curve point,
- *                    x_a and y_a are the resulting affine coordinates of the
- *                      curve point,
- *                    x,y and z are a set of projective coordinates of the
- *                      point and
- *                    p is the modulus of the P-384 underlying finite field.
- *
- * This routine computes the affine coordinates for a set of projective
- * coordinates of a valid P-384 curve point. The routine performs the required
- * divisions by computing the multiplicative modular inverse of the
- * projective z-coordinate in the underlying finite field of the P-384 curve.
- * For inverse computation Fermat's little theorem is used, i.e.
- * we compute z^-1 = z^(p-2) mod p.
- * For exponentiation a 16 step addition chain is used.
- * Source of the addition chain is the addchain project:
- * https://github.com/mmcloughlin/addchain/
- *
- * Flags: Flags have no meaning beyond the scope of this subroutine.
- *
- * @param[in]  [w26,w25]: x, x-coordinate of curve point (projective).
- * @param[in]  [w26,w25]: y, y-coordinate of curve point (projective).
- * @param[in]  [w30,w29]: z, z-coordinate of curve point (projective).
- * @param[in]  [w13, w12]: p, modulus of P-384.
- * @param[in]  [w15, w14]: u, pre-computed Barrett constant for p,
- *                            lower 384 bits, i.e. (2^(2*384) div p)[383:0].
- * @param[in] w31: all-zero.
- * @param[out] [w26, w25]: x_a, affine x-coordinate of resulting point.
- * @param[out] [w28, w27]: y_a, affine y-coordinate of resulting point.
- *
- * clobbered registers: w0 to w28
- * clobbered flag groups: FG0
- */
-proj_to_affine_p384:
-
-  /* Exp: 0b10 = 2*0b1
-     Val: r10 = z^2 mod p
-          [w17,w16] <= [w30,w29]^2 mod [w13,w12] */
-  bn.mov    w10, w29
-  bn.mov    w11, w30
-  bn.mov    w16, w29
-  bn.mov    w17, w30
-  jal       x1, barrett384_p384
-
-  /* Exp: 0b11 = 0b1+0b10
-     Val: r11 <= z*r10 mod p
-          [w17,w16] <= [w30,w29]*[w17,w16] mod [w13,w12] */
-  bn.mov    w10, w29
-  bn.mov    w11, w30
-  jal       x1, barrett384_p384
-
-  /* Exp: 0b110 = 2*0b11
-     Val: r110 = r11^2 mod p
-          [w17,w16] <= [w17,w16]^2 mod [w13,w12] */
-  bn.mov    w10, w16
-  bn.mov    w11, w17
-  jal       x1, barrett384_p384
-
-  /* Exp: 0b111 = 0b1+0b110
-     Val: r111 <= z*r110  mod p
-          [w1,w0] = [w17,w16] <= [w30,w29]*[w17,w16] mod [w13,w12] */
-  bn.mov    w10, w29
-  bn.mov    w11, w30
-  jal       x1, barrett384_p384
-  bn.mov    w0, w16
-  bn.mov    w1, w17
-
-  /* Exp: 0b111000 = 0b111<<3
-     Val: r111000 <= r111^(2^3)  mod p
-          [w17,w16] <= [w17,w16]^(2^3) mod [w13,w12] */
-  loopi     3, 4
-    bn.mov    w10, w16
-    bn.mov    w11, w17
-    jal       x1, barrett384_p384
-    nop
-
-  /* Exp: 0b1111111 = 0b111+0b111000
-     Val: r1111111 <= r111*r111000 mod p
-          [w3,w2] = [w17,w16] <= [w1,w0]*[w17,w16] mod [w13,w12] */
-  bn.mov    w10, w0
-  bn.mov    w11, w1
-  jal       x1, barrett384_p384
-  bn.mov    w2, w16
-  bn.mov    w3, w17
-
-  /* Exp: 2^12-1 = (0b1111111<<6)+0b111111
-     Val: r_12_1 <= r111111^(2^6)*r111111 mod p
-          [w5,w4] = [w17,w16] <= [w17,w16]^(2^6)*[w17,w16] mod [w13,w12] */
-  loopi     6, 4
-    bn.mov    w10, w16
-    bn.mov    w11, w17
-    jal       x1, barrett384_p384
-    nop
-  bn.mov    w10, w2
-  bn.mov    w11, w3
-  jal       x1, barrett384_p384
-  bn.mov    w4, w16
-  bn.mov    w5, w17
-
-  /* Exp: 2^24-1 = ((2^12-1)<<12)+(2^12-1)
-     Val: r_24_1 <= r_12_1^(2^12)*r12_1 mod p
-          [w17,w16] <= [w17,w16]^(2^12)*[w5,w4] mod [w13,w12] */
-  loopi     12, 4
-    bn.mov    w10, w16
-    bn.mov    w11, w17
-    jal       x1, barrett384_p384
-    nop
-  bn.mov    w10, w4
-  bn.mov    w11, w5
-  jal       x1, barrett384_p384
-
-  /* Exp: 2^30-1 = ((2^24-1)<<6)+0b111111
-     Val: r_30_1 <= r_24_1^(2^6)*r111111 mod p
-          [w3, w2] = [w17,w16] <= [w17,w16]^(2^6)*[w3,w2] mod [w13,w12] */
-  loopi     6, 4
-    bn.mov    w10, w16
-    bn.mov    w11, w17
-    jal       x1, barrett384_p384
-    nop
-  bn.mov    w10, w2
-  bn.mov    w11, w3
-  jal       x1, barrett384_p384
-  bn.mov    w2, w16
-  bn.mov    w3, w17
-
-  /* Exp: 2^31-1 <= (2^30-1)*2+0b1
-     Val: r_31_1 <= r30_1^2*z mod p
-          [w7,w6] = [w17,w16] <= [w17,w16]^2*[w30,w29] mod [w13,w12] */
-  bn.mov    w10, w16
-  bn.mov    w11, w17
-  jal       x1, barrett384_p384
-  bn.mov    w10, w29
-  bn.mov    w11, w30
-  jal       x1, barrett384_p384
-  bn.mov    w6, w16
-  bn.mov    w7, w17
-
-  /* Exp: 2^32-1 <= (2^30-1)*2+0b1
-     Val: r_32_1 <= r31_1^2*z mod p
-          [w9,w8] = [w17,w16] <= [w17,w16]^2*[w30,w29] mod [w13,w12] */
-  bn.mov    w10, w16
-  bn.mov    w11, w17
-  jal       x1, barrett384_p384
-  bn.mov    w10, w29
-  bn.mov    w11, w30
-  jal       x1, barrett384_p384
-  bn.mov    w9, w16
-  bn.mov    w8, w17
-
-  /* Exp: 2^63-1 <= ((2^32-1)<<31)+(2^31-1)
-     Val: r_63_1 <= r_32_1^(2^31)*r_31_1 mod p
-          [w7,w6] = [w17,w16] <= [w17,w16]^(2^31)*[w7,w6] mod [w13,w12] */
-  loopi     31, 4
-    bn.mov    w10, w16
-    bn.mov    w11, w17
-    jal       x1, barrett384_p384
-    nop
-  bn.mov    w10, w6
-  bn.mov    w11, w7
-  jal       x1, barrett384_p384
-  bn.mov    w6, w16
-  bn.mov    w7,w17
-
-  /* Exp: 2^126-1 = ((2^63-1)<<63) + (2^63-1)
-     Val: r_126_1 <= r_63_1^(2^63)*r_63_1 mod p
-          [w7,w6] = [w17,w16] <= [w17,w16]^(2^63)*[w7,w6] mod [w13,w12] */
-  loopi     63, 4
-    bn.mov    w10, w16
-    bn.mov    w11, w17
-    jal       x1, barrett384_p384
-    nop
-  bn.mov    w10, w6
-  bn.mov    w11, w7
-  jal       x1, barrett384_p384
-  bn.mov    w6, w16
-  bn.mov    w7, w17
-
-  /* Exp: 2^252-1 = ((2^126-1)<<126)+(2^126-1)
-     Val: r_252_1 <= r_126_1^(2^63)*r_126_1 mod p
-          [w17,w16] <= [w17,w16]^(2^126)*[w7,w6] mod [w13,w12] */
-  loopi     126, 4
-    bn.mov    w10, w16
-    bn.mov    w11, w17
-    jal       x1, barrett384_p384
-    nop
-  bn.mov    w10, w6
-  bn.mov    w11, w7
-  jal       x1, barrett384_p384
-
-  /* Exp: 2^255-1 = ((2^252-1)<<3)+0b111
-     Val: r_255_1 <= r_252_1^(2^3)*r111 mod p
-          [w17,w16] <= [w17,w16]^(2^3)*[w1,w0] mod [w13,w12] */
-  loopi     3, 4
-    bn.mov    w10, w16
-    bn.mov    w11, w17
-    jal       x1, barrett384_p384
-    nop
-  bn.mov    w10, w0
-  bn.mov    w11, w1
-  jal       x1, barrett384_p384
-
-  /* Exp: p-2 = ((((((2^255-1)<<33)+(2^32-1))<<94)+(2^30-1))<<2)+0b1
-     Val: x_inv <=((r_255_1^(2^33)*r_32_1)^(2^94)*r_30_1)^(2^2)*z mod p
-          [w17,w16] <= (([w17,w16]^(2^33)*[w9,w8])^(2^94)*[w3,w2])^(2^2)
-                       *[w30,w29] mod [w13,w12] */
-  loopi     33, 4
-    bn.mov    w10, w16
-    bn.mov    w11, w17
-    jal       x1, barrett384_p384
-    nop
-  bn.mov    w10, w9
-  bn.mov    w11, w8
-  jal       x1, barrett384_p384
-  loopi     94, 4
-    bn.mov    w10, w16
-    bn.mov    w11, w17
-    jal       x1, barrett384_p384
-    nop
-  bn.mov    w10, w2
-  bn.mov    w11, w3
-  jal       x1, barrett384_p384
-  loopi     2, 4
-    bn.mov    w10, w16
-    bn.mov    w11, w17
-    jal       x1, barrett384_p384
-    nop
-  bn.mov    w10, w29
-  bn.mov    w11, w30
-  jal       x1, barrett384_p384
-
-  /* store inverse [w1,w0] <= [w17,w16] = z_inv*/
-  bn.mov w0, w16
-  bn.mov w1, w17
-
-  /* convert x-coordinate to affine space
-     [w26,w25] <= [w17,w16] = x_a <= x/z = x*z_inv = [w26,w25]*[w1,w0] mod p */
-  bn.mov    w10, w25
-  bn.mov    w11, w26
-  jal       x1, barrett384_p384
-  bn.mov    w25, w16
-  bn.mov    w26, w17
-
-  /* convert y-coordinate to affine space
-     [w28,w27] <= [w17,w16] = y_a <= y/z = y*z_inv = [w28,w27]*[w1,w0] mod p */
-  bn.mov    w10, w27
-  bn.mov    w11, w28
-  bn.mov    w16, w0
-  bn.mov    w17, w1
-  jal       x1, barrett384_p384
-  bn.mov    w27, w16
-  bn.mov    w28, w17
-
-  ret
-
-
-/**
- * Fetch curve point from dmem, randomize z-coordinate and store point in dmem
- *
- * returns P = (x, y, z) = (x_a*z, y_a*z, z)
- *         with P being a valid P-384 curve point in projective coordinates
- *              x_a and y_a being the affine coordinates as fetched from dmem
- *              z being a randomized z-coordinate
- *
- * This routines fetches the affine x- and y-coordinates of a curve point from
- * dmem and computes a valid set of projective coordinates. The z-coordinate is
- * randomized and x and y are scaled appropriately. The resulting projective
- * coordinates are stored at dmem[dptr_p_p] using 6 consecutive 256-bit cells,
- * i.e. each coordinate is stored 512 bit aligned, little endian.
- * This routine runs in constant time.
- *
- * @param[in]  x20: dptr_x, pointer to dmem location containing affine
- *                          x-coordinate of input point
- * @param[in]  x21: dptr_y, pointer to dmem location containing affine
- *                          y-coordinate of input point
- * @param[in]  [w15, w14]: u[383:0] lower 384 bit of Barrett constant u for
- *                                    modulus p
- * @param[in]  [w13, w12]: p, modulus of P-384 underlying finite field
- * @param[in]  w31: all-zero
- * @param[in]  x18: dptr_p_p, pointer to dmem location to store resulting point
- *                            in projective space
- *
- * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
- *        the upper limb of projective y-coordinate.
- *
- * clobbered registers: x10, x11 to x13
-  *                     w2, w3, w8 to w11, w16 to w24, w29, w30
- * clobbered flag groups: FG0
- */
-store_proj_randomize:
-
-  /* get a 384-bit random number
-    [w3, w2] = random(384) */
-  bn.wsrr   w2, 1
-  bn.wsrr   w3, 1
-  bn.rshi   w3, w31, w3 >> 128
-
-  /* reduce random number
-     [w2, w3] = z <= [w2, w3] mod p */
-  bn.sub   w10, w2, w12
-  bn.subb  w11, w3, w13
-  bn.sel   w2, w2, w10, C
-  bn.sel   w3, w3, w11, C
-
-  bn.mov w10, w2
-  bn.mov w11, w3
-
-  /* store z-coordinate
-     dmem[x20+128] = [w10, w11] */
-  li        x10, 10
-  li        x11, 11
-  bn.sid    x10, 128(x18)
-  bn.sid    x11, 160(x18)
-
-  /* fetch x-coordinate from dmem
-     [w16, w17] = x <= [dmem[dptr_x], dmem[dptr_x+32]] */
-  li x12, 16
-  li x13, 17
-  bn.lid    x12,  0(x20)
-  bn.lid    x13, 32(x20)
-
-  /* scale and store x-coordinate
-     [dmem[dptr_p_p], dmem[dptr_p_p+32]] = [w17, w16] =
-       x_p <= [w11, w10] * [w17, w16] = z*x  mod p */
-
-  jal       x1, barrett384_p384
-  bn.sid    x12,  0(x18)
-  bn.sid    x13, 32(x18)
-
-  /* fetch y-coordinate from dmem
-     [w11, w10] = x <= [dmem[dptr_y], dmem[dptr_y+32]] */
-  bn.lid    x12,  0(x21)
-  bn.lid    x13, 32(x21)
-
-  /* scale and store y-coordinate
-     [dmem[dptr_p_p+64], dmem[dptr_p_p+96]] = [w17, w16] =
-       y_p <= [w11, w10] * [w17, w16] = z*y  mod p */
-  bn.mov w10, w2
-  bn.mov w11, w3
-  jal       x1, barrett384_p384
-  bn.sid    x12, 64(x18)
-  bn.sid    x13, 96(x18)
-
-  ret
-
-
-/**
- * P-384 scalar point multiplication in affine space
- *
- * returns R = k*P = k*(x_p, y_p)
- *         where R, P are valid P-384 curve points in affine coordinates,
- *               k is a 384-bit scalar.
- *
- * This routine performs scalar multiplication based on the group laws
- * of Weierstrass curves.
- * A constant time double-and-add algorithm (sometimes referred to as
- * double-and-add-always) is used.
- * Due to the P-384 optimized implementations of the internally called routines
- * for point addition and doubling, this routine is limited to P-384 curves.
- * The routine makes use of blinding by additive splitting the
- * exponent/scalar d into two shares. The double-and-add loop operates on both
- * shares in parallel applying Shamir's trick.
- *
- * @param[in]  x9: dptr_rnd, pointer to location in dmem containing random
- *                           number to be used for additive splitting of scalar
- * @param[in]  x19: dptr_k, pointer to scalar k (0 < k < n) in dmem
- * @param[in]  x20: dptr_x, pointer to affine x-coordinate in dmem
- * @param[in]  x21: dptr_y, pointer to affine y-coordinate in dmem
- * @param[in]  x28: dptr_b, pointer to domain parameter b of P-384 in dmem
- * @param[in]  x30: dptr_sp, pointer to 704 bytes of scratchpad memory in dmem
- * @param[in]  [w15, w14]: u[383:0] lower 384 bit of Barrett constant u
- *                                  corresponding to modulus p
- * @param[in]  [w13, w12]: p, modulus of P-384 underlying finite field
- * @param[in]  [w11, w10]: n, domain parameter of P-384 curve
- *                            (order of basepoint G)
- * @param[in]  w31: all-zero
- * @param[out] [w26, w25]: x_a, affine x-coordinate of resulting point R.
- * @param[out] [w28, w26]: y_a, affine y-coordinate of resulting point R.
- *
- * Scratchpad memory layout:
- * The routine expects at least 704 bytes of scratchpad memory at dmem
- * location 'scratchpad' (sp). Internally the scratchpad is used as follows:
- * dptr_sp     .. dptr_sp+191: point P, projective
- * dptr_sp+192 .. dptr_sp+255: s0, 1st share of scalar
- * dptr_sp+256 .. dptr_sp+447: point 2P, projective
- * dptr_sp+448 .. dptr_sp+511: s1, 2nd share of scalar
- * dptr_sp+512 .. dptr_sp+703: point Q, projective
- *
- * Projective coordinates of a point are kept in dmem in little endian format
- * with the individual coordinates 512 bit aligned. The coordinates are stored
- * in x,y,z order (i.e. x at lowest, z at highest address). Thus, a 384 bit
- * curve point occupies 6 consecutive 256-bit dmem cells.
- *
- * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
- *        the computed affine y-coordinate.
- *
- * clobbered registers: x2, x10, x11 to x13, x18, x26, x27, w0 to w30
- * clobbered flag groups: FG0
- */
-scalar_mult_int_p384:
-
-  /* set regfile pointers to in/out regs of Barrett routine. Set here to avoid
-     resetting in very call to point addition routine */
-  li        x22, 10
-  li        x23, 11
-  li        x24, 16
-  li        x25, 17
-
-  /* fetch externally supplied random number from dmem
-     [w1, w0] = dmem[dptr_rnd] = [dmem[x9], dmem[x9+32]] = rnd */
-  li        x2, 0
-  bn.lid    x2++, 0(x9)
-  bn.lid    x2++, 32(x9)
-
-  /* 1st share (reduced rnd)
-     s0 = [w1, w0] <= rnd mod n = [w1, w0] mod [w11, w10] */
-  bn.sub    w9, w0, w10
-  bn.subb   w8, w1, w11
-  bn.sel    w0, w0, w9, C
-  bn.sel    w1, w1, w8, C
-
-  /* load scalar k from dmem
-     [w3, w2] = k <= dmem[dptr_k] = [dmem[x19], dmem[x19+32]] */
-  bn.lid    x2++, 0(x19)
-  bn.lid    x2, 32(x19)
-
-  /* 2nd share (k-s0)
-     s1 = [w3, w2] <= k - s0 mod n = [w2, w3] - [w1, w0] mod [w11, w10] */
-  bn.sub    w2, w2, w0
-  bn.subb   w3, w3, w1
-  bn.add    w8, w2, w10
-  bn.addc   w9, w3, w11
-  bn.sel    w2, w8, w2, C
-  bn.sel    w3, w9, w3, C
-
-  /* left align both shares for probing of MSB in loop body */
-  bn.rshi   w1, w1, w0 >> 128
-  bn.rshi   w0, w0, w31 >> 128
-  bn.rshi   w3, w3, w2 >> 128
-  bn.rshi   w2, w2, w31 >> 128
-
-   /* store shares in scratchpad */
-  li        x2, 0
-  bn.sid    x2++, 192(x30)
-  bn.sid    x2++, 224(x30)
-  bn.sid    x2++, 448(x30)
-  bn.sid    x2++, 480(x30)
-
-  /* get randomized projective coodinates of curve point
-     P = (x_p, y_p, z_p) = dmem[dptr_sp] = (x*z mod p, y*z mod p, z) */
-  add       x18, x30, 0
-  jal       x1, store_proj_randomize
-
-  /* double point P
-     2P = ([w30,w29], [w28,w27], [w26, w25]) <= 2*P */
-  add       x27, x30, x0
-  add       x26, x30, x0
-  jal       x1, proj_add_p384
-
-  /* store point 2P in scratchpad @w30+256
-     dmem[dptr_sc+256] = [w30:w25] = 2P */
-  li        x2, 25
-  bn.sid    x2++, 256(x30)
-  bn.sid    x2++, 288(x30)
-  bn.sid    x2++, 320(x30)
-  bn.sid    x2++, 352(x30)
-  bn.sid    x2++, 384(x30)
-  bn.sid    x2++, 416(x30)
-
-  /* init point Q = (0,1,0) for double-and-add in scratchpad */
-  /* dmem[x26] = dmem[dptr_sc+512] = Q = (0,1,0) */
-  addi      x26, x30, 512
-  li        x2, 30
-  bn.addi   w30, w31, 1
-  bn.sid    x2++, 64(x26)
-  bn.sid    x2, 0(x26)
-  bn.sid    x2, 32(x26)
-  bn.sid    x2, 96(x26)
-  bn.sid    x2, 128(x26)
-  bn.sid    x2, 160(x26)
-
-  /* double-and-add loop with decreasing index */
-  loopi     384, 85
-
-    /* double point Q
-       Q = ([w30,w29], [w28,w27], [w26, w25]) <= Q + dmem[x27] */
-    add       x27, x26, x0
-    jal       x1, proj_add_p384
-
-    /* store Q in dmem
-     dmem[x26] = dmem[dptr_sc+512] <= [w30:w25] */
-    li        x2, 25
-    bn.sid    x2++, 0(x26)
-    bn.sid    x2++, 32(x26)
-    bn.sid    x2++, 64(x26)
-    bn.sid    x2++, 96(x26)
-    bn.sid    x2++, 128(x26)
-    bn.sid    x2++, 160(x26)
-
-    /* Probe if MSb of either of the two scalars (rnd or d-rnd) but not both
-       is 1.
-       If only one MSb is set, select P for addition.
-       If both MSbs are set, select 2P for addition.
-       (If neither MSB is set, 2P will be selected but result discarded.) */
-    li        x2, 0
-    bn.lid    x2++, 224(x30)
-    bn.lid    x2, 480(x30)
-    bn.xor    w8, w0, w1
-    /* Create conditional offeset into scratchpad.
-       if (s0[512] xor s1[512]) x27 <= x30 else x27 <= x30+256 */
-    csrrs     x3, 0x7c0, x0
-    xori      x3, x3, -1
-    andi      x3, x3, 2
-    slli      x27, x3, 7
-    add       x27, x27, x30
-
-    /* Reload randomized projective coodinates for curve point P.
-       P = (x_p, y_p, z_p) = dmem[dptr_sp] <= (x*z mod p, y*z mod p, z) */
-    jal       x1, store_proj_randomize
-
-    /* Add points Q+P or Q+2P depending on offset in x27.
-       Q_a = ([w30,w29], [w28,w27], [w26, w25]) <= Q + dmem[x27] */
-    jal       x1, proj_add_p384
-
-    /* load shares from scratchpad
-       [w1, w0] = s0; [w3, w2] = s1 */
-    li        x2, 0
-    bn.lid    x2++, 192(x30)
-    bn.lid    x2++, 224(x30)
-    bn.lid    x2++, 448(x30)
-    bn.lid    x2++, 480(x30)
-
-    /* M = s0[511] | s1[511] */
-    bn.or     w8, w1, w3
-
-    /* load q from scratchpad
-        Q = ([w9,w8], [w7,w6], [w5,w4]) <= dmem[x26] */
-    li        x2, 4
-    bn.lid    x2++, 0(x26)
-    bn.lid    x2++, 32(x26)
-    bn.lid    x2++, 64(x26)
-    bn.lid    x2++, 96(x26)
-    bn.lid    x2++, 128(x26)
-    bn.lid    x2++, 160(x26)
-
-    /* select either Q or Q_a
-       if M: Q = ([w30,w29], [w28,w27], [w26, w25]) <= Q else: Q <= Q_a */
-    bn.sel    w25, w25, w4, M
-    bn.sel    w26, w26, w5, M
-    bn.sel    w27, w27, w6, M
-    bn.sel    w28, w28, w7, M
-    bn.sel    w29, w29, w8, M
-    bn.sel    w30, w30, w9, M
-
-    /* store Q in dmem
-     dmem[x26] = dmem[dptr_sc+512] <= [w30:w25] */
-    li        x2, 25
-    bn.sid    x2++, 0(x26)
-    bn.sid    x2++, 32(x26)
-    bn.sid    x2++, 64(x26)
-    bn.sid    x2++, 96(x26)
-    bn.sid    x2++, 128(x26)
-    bn.sid    x2++, 160(x26)
-
-    /* left shift both shares
-       s0 <= s0 << 1 ; s1 <= s1 << 1 */
-    bn.add    w0, w0, w0
-    bn.addc   w1, w1, w1
-    bn.add    w2, w2, w2
-    bn.addc   w3, w3, w3
-    /* store both shares in scratchpad */
-    li        x2, 0
-    bn.sid    x2++, 192(x30)
-    bn.sid    x2++, 224(x30)
-    bn.sid    x2++, 448(x30)
-    bn.sid    x2++, 480(x30)
-
-
-    /* Get a fresh random number and scale the coordinates of 2P.
-       (scaling each proj. coordinate by same factor results in same point) */
-
-    /* get a 384-bit random number */
-    bn.wsrr   w2, 1
-    bn.wsrr   w3, 1
-    bn.rshi   w3, w31, w3 >> 128
-
-    /* reduce random number
-      [w2, w3] = z <= [w2, w3] mod p */
-    bn.sub    w10, w2, w12
-    bn.subb   w11, w3, w13
-    bn.sel    w2, w2, w10, C
-    bn.sel    w3, w3, w11, C
-
-    /* scale all coordinates in scratchpad */
-    li        x2, 16
-    li        x3, 17
-    /* x-coordinate */
-    bn.mov    w10, w2
-    bn.mov    w11, w3
-    bn.lid    x2, 256(x30)
-    bn.lid    x3, 288(x30)
-    jal       x1, barrett384_p384
-    bn.sid    x2, 256(x30)
-    bn.sid    x3, 288(x30)
-    /* y-coordinate */
-    bn.mov    w10, w2
-    bn.mov    w11, w3
-    bn.lid    x2, 320(x30)
-    bn.lid    x3, 352(x30)
-    jal       x1, barrett384_p384
-    bn.sid    x2, 320(x30)
-    bn.sid    x3, 352(x30)
-    /* z-coordinate */
-    bn.mov    w10, w2
-    bn.mov    w11, w3
-    bn.lid    x2, 384(x30)
-    bn.lid    x3, 416(x30)
-    jal       x1, barrett384_p384
-    bn.sid    x2, 384(x30)
-    bn.sid    x3, 416(x30)
-
-  /* convert coordinates to affine space */
-  jal       x1, proj_to_affine_p384
-
-  ret
-
-
-/**
- * Externally callable wrapper for P-384 scalar point multiplication
- *
- * returns R = k*P = k*(x_p, y_p)
- *         where R, P are valid P-384 curve points in affine coordinates,
- *               k is a 384-bit scalar..
- *
- * Sets up context and calls internal scalar multiplication routine.
- * This routine runs in constant time.
- *
- * @param[in]  dmem[0]: dK, pointer to location in dmem containing scalar k
- * @param[in]  dmem[4]: dRnd, pointer to location in dmem containing random
- *                        number for blinding
- * @param[in]  dmem[20]: dptr_x, pointer to affine x-coordinate in dmem
- * @param[in]  dmem[22]: dptr_y, pointer to affine y-coordinate in dmem
- *
- * 384-bit quantities have to be provided in dmem in little-endian format,
- * 512 bit aligned, with the highest 128 bit set to zero.
- *
- * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
- *        the computed affine y-coordinate.
- *
- * clobbered registers: x2, x3, x9 to x13, x18 to x21, x26 to x30
- *                      w0 to w30
- * clobbered flag groups: FG0
- */
-.globl scalar_mult_p384
-scalar_mult_p384:
-
-  /* set dmem pointer to point x-coordinate */
-  la        x20, dptr_x
-  lw        x20, 0(x20)
-
-  /* set dmem pointer to point y-coordinate */
-  la        x21, dptr_y
-  lw        x21, 0(x21)
-
-  /* set dmem pointer to scalar k */
-  la        x19, dptr_k
-  lw        x19, 0(x19)
-
-  /* set pointer to blinding parameter */
-  la        x9, dptr_rnd
-  lw        x9, 0(x9)
-
-  /* set dmem pointer to domain parameter b */
-  la        x28, p384_b
-
-  /* set dmem pointer to scratchpad */
-  la        x30, scratchpad
-
-  /* load domain parameter p (modulus)
-     [w13, w12] = p = dmem[p384_p] */
-  li        x2, 12
-  la        x3, p384_p
-  bn.lid    x2++, 0(x3)
-  bn.lid    x2++, 32(x3)
-
-  /* load Barrett constant u for modulus p
-     [w15, w14] = u_p = dmem[p384_u_p] */
-  li        x2, 14
-  la        x3, p384_u_p
-  bn.lid    x2++, 0(x3)
-  bn.lid    x2++, 32(x3)
-
-  /* load domain parameter n (order of base point)
-     [w11, w10] = n = dmem[p384_n] */
-  li        x2, 10
-  la        x3, p384_n
-  bn.lid    x2++, 0(x3)
-  bn.lid    x2++, 32(x3)
-
-  /* init all-zero reg */
-  bn.xor    w31, w31, w31
-
-  jal       x1, scalar_mult_int_p384
-
-  ret
-
-
-/**
- * Variable-time modular multiplicative inverse computation
- *
- * returns x_inv = x^-1 mod m
- *
- * This routine computes the modular multiplicative inverse for any x < m in
- * the finite field GF(m) where m is prime.
- *
- * For inverse computation, Fermat's little theorem is used, i.e.
- * we compute x^-1 = x^(m-2) mod m.
- * For exponentiation we use a standard, variable-time (!) square and multiply
- * algorithm.
- *
- * This routine is mainly intended to be used for inversion of scalars in
- * context of the P-384 curve. In theory, it can be used with any 384-bit
- * modulus m with a corresponding 385-bit Barrett constant u,
- * where u[383:192] = 0.
- *
- * Note: When used for P-384 scalar inversion, the routine will need 672 calls
- * to the multiplication routine. By using an adder chain this could be reduced
- * to ~433 multiplications, however, at the cost of a significant codes size
- * increase.
- *
- * Note: This routine runs in variable-time w.r.t. the modulus. It should only
- * be used with a non-secret modulus.
- *
- * @param[in]  [w13, w12]: m, 384 bit modulus
- * @param[in]  [w15, w14]: u[383:0], lower 384 bit of pre-computed Barrett
- *                          constant corresponding to modulus m
- * @param[in]  [w30, w29]: x, 384 bit operand
- * @param[in]  w31, all-zero
- * @param[out] [w17, w16]: x_inv, modular multiplicative inverse
- *
- * Flags: Flags have no meaning beyond the scope of this subroutine.
- *
- * clobbered registers: x2, w2, w3, w10, w11, w16 to w24
- * clobbered flag groups: FG0
- */
-mod_inv_n_p384:
-
-  /* subtract 2 from modulus for Fermat's little theorem
-     [w13,w12] <= m - 2 = [w11,w10]-2 (left aligned) */
-  bn.subi   w2, w12, 2
-  bn.subb   w3, w13, w31
-  bn.rshi   w3, w3, w2 >> 128
-  bn.rshi   w2, w2, w31 >> 128
-
-  /* init square and multiply: [w17,w16] = 1 */
-  bn.addi   w16, w31, 1
-  bn.mov    w17, w31
-
-  /* square and multiply loop */
-  loopi     384, 12
-
-    /* square: [w17,w16] <= [w17, w16]*[w11,w10] mod [w13, w12] */
-    bn.mov    w10, w16
-    bn.mov    w11, w17
-    jal       x1, barrett384_p384
-
-    /* shift MSB into carry flag
-       [w3,w2] = 2*[w3,w2] = [w3,w2] << 1 */
-    bn.add    w2, w2, w2
-    bn.addc   w3, w3, w3
-
-    /* skip multiplication if C flag not set */
-    csrrs     x2, 1984, x0
-    andi      x2, x2, 1
-    beq       x2, x0, nomul
-
-    /* multiply: [w17,w16] <= [w17, w16]*[w30,w29] mod [w13, w12] */
-    bn.mov    w10, w29
-    bn.mov    w11, w30
-    jal       x1, barrett384_p384
-
-    nomul:
-    nop
-
-  ret
-
-
-/**
- * P-384 ECDSA signature generation
- *
- * returns the signature as the pair r, s with
- *         r = x_1  mod n
- *     and s = k^(-1)(msg + r*d)  mod n
- *         where x_1 is the affine x-coordinate of the curve point k*G,
- *               G is the curve's base point,
- *               k is a supplied secret random number,
- *               n is the order of the base point G of P-256,
- *               msg is the message to be signed, and
- *               d is the private key.
- *
- * This routine runs in constant time.
- *
- * @param[in]  dmem[0]: dptr_k, pointer to a 384 bit random secret in dmem
- * @param[in]  dmem[4]: dptr_rnd, pointer to location in dmem containing
- *                       a 384-bit random number for blinding
- * @param[in]  dmem[8]: dptr_msg, pointer to the message to be signed in dmem
- * @param[in]  dmem[12]: dptr_r, pointer to dmem location where s component
- *                               of signature will be placed
- * @param[in]  dmem[16]: dptr_s, pointer to dmem location where r component
- *                               of signature will be placed
- * @param[in]  dmem[28]: dptr_d, pointer to private key d in dmem
- *
- * Flags: Flags have no meaning beyond the scope of this subroutine.
- *
- * clobbered registers: x2, x3, x9 to x13, x18 to x28, x30
- *                      w0 to w31
- * clobbered flag groups: FG0
- */
-.globl p384_sign
-p384_sign:
-  /* init all-zero reg */
-  bn.xor    w31, w31, w31
-
-  /* set dmem pointer to domain parameter b */
-  la        x28, p384_b
-
-  /* set dmem pointer to basepoint x-coordinate */
-  la        x20, p384_gx
-
-  /* set dmem pointer to basepoint y-coordinate */
-  la        x21, p384_gy
-
-  /* set dmem pointer to secret random scalar k */
-  la        x19, dptr_k
-  lw        x19, 0(x19)
-
-  /* set pointer to blinding parameter */
-  la        x9, dptr_rnd
-  lw        x9, 0(x9)
-
-  /* set dmem pointer to scratchpad */
-  la        x30, scratchpad
-
-  /* load domain parameter p (modulus)
-     [w13, w12] <= p = dmem[dptr_p] */
-  li        x2, 12
-  la        x3, p384_p
-  bn.lid    x2++, 0(x3)
-  bn.lid    x2++, 32(x3)
-
-  /* load Barrett constant u for modulus p
-     [w15, w14] = u_p = dmem[p384_u_p] */
-  li        x2, 14
-  la        x3, p384_u_p
-  bn.lid    x2++, 0(x3)
-  bn.lid    x2++, 32(x3)
-
-  /* load domain parameter n (order of base point)
-     [w11, w10] = n = dmem[p384_n] */
-  li        x2, 10
-  la        x3, p384_n
-  bn.lid    x2++, 0(x3)
-  bn.lid    x2++, 32(x3)
-
-  /* scalar multiplication with base point
-     [w28:w25] <= (x_1, y_1) = k*G */
-  jal       x1, scalar_mult_int_p384
-
-  /* store r of signature in dmem: dmem[dptr_r] <= r = [w26,w25] */
-  li        x2, 25
-  la        x3, dptr_r
-  lw        x3, 0(x3)
-  bn.sid    x2++, 0(x3)
-  bn.sid    x2++, 32(x3)
-
-  /* load secret random number k from dmem
-     [w30,w29] <= k = dmem[dptr_k] */
-  li        x2, 29
-  bn.lid    x2++, 0(x19)
-  bn.lid    x2++, 32(x19)
-
-  /* load domain parameter n (order of base point)
-     [w13, w12] <= p = dmem[p384_n] */
-  li        x2, 12
-  la        x3, p384_n
-  bn.lid    x2++, 0(x3)
-  bn.lid    x2++, 32(x3)
-
-  /* load Barrett constant u_n for modulus n for scalar operations
-     [w15, w14] <= u_m = dmem[p384_u_n] */
-  li        x2, 14
-  la        x3, p384_u_n
-  bn.lid    x2++, 0(x3)
-  bn.lid    x2++, 32(x3)
-
-  /* modular multiplicative inverse of k
-     [w3, w2] <= [w17, w16] <= k^(-1) mod n */
-  jal       x1, mod_inv_n_p384
-  bn.mov    w2, w16
-  bn.mov    w3, w17
-
-  /* load private key d from dmem
-     [w11,w10] <= d = dmem[dptr_d] */
-  li        x2, 10
-  la        x3, dptr_d
-  lw        x3, 0(x3)
-  bn.lid    x2++, 0(x3)
-  bn.lid    x2++, 32(x3)
-
-  /* [w17, w16] <= k^(-1)*d mod n = [w17, w16] * [w11, w10] mod [w13, w12] */
-  jal       x1, barrett384_p384
-
-  /*  [w5, w4] <= [w17, w16]
-        <= r * (k^(-1)*d) mod n = [w26, w25] * [w17, w16] mod [w13, w12] */
-  bn.mov    w10, w25
-  bn.mov    w11, w26
-  jal       x1, barrett384_p384
-  bn.mov    w4, w16
-  bn.mov    w5, w17
-
-  /* load message from dmem
-     [w11, w10] <= msg = dmem[dptr_msg] */
-  li        x2, 10
-  la        x3, dptr_msg
-  lw        x3, 0(x3)
-  bn.lid    x2++, 0(x3)
-  bn.lid    x2++, 32(x3)
-
-  /* [w17, w16] <= k^(-1) * msg = [w3, w2]*[w17, w16] mod n */
-  bn.mov    w16, w2
-  bn.mov    w17, w3
-  jal       x1, barrett384_p384
-
-  /* [w28, w27] <= s' = k^(-1)*msg + k^(-1)*r*d  = [w17, w16] + [w5, w4]*/
-  bn.add    w27, w16, w4
-  bn.addc   w28, w17, w5
-
-  /* reduce s: [w28, w27] <= s <= s' mod n = [w28, w27] mod [w13, w12] */
-  bn.sub    w10, w27, w12
-  bn.subb   w11, w28, w13
-  bn.sel    w27, w27, w10, C
-  bn.sel    w28, w28, w11, C
-
-  /* store s of signature in dmem: dmem[dptr_s] <= s = [w28, w27] */
-  li        x2, 27
-  la        x3, dptr_s
-  lw        x3, 0(x3)
-  bn.sid    x2++, 0(x3)
-  bn.sid    x2++, 32(x3)
-
-  ret
-
-
-/* constants, pointers and scratchpad memory */
-.section .data
-
-/* pointer to k (dptr_k) */
-.globl dptr_k
-dptr_k:
-  .zero 4
-
-/* pointer to rnd (dptr_rnd) */
-.globl dptr_rnd
-dptr_rnd:
-  .zero 4
-
-/* pointer to msg (dptr_msg) */
-.globl dptr_msg
-dptr_msg:
-  .zero 4
-
-/* pointer to R (dptr_r) */
-.globl dptr_r
-dptr_r:
-  .zero 4
-
-/* pointer to S (dptr_s) */
-.globl dptr_s
-dptr_s:
-  .zero 4
-
-/* pointer to X (dptr_x) */
-.globl dptr_x
-dptr_x:
-  .zero 4
-
-/* pointer to Y (dptr_y) */
-.globl dptr_y
-dptr_y:
-  .zero 4
-
-/* pointer to D (dptr_d) */
-.globl dptr_d
-dptr_d:
-  .zero 4
-
-/* P-384 domain parameter b */
-.globl p384_b
-p384_b:
-  .word 0xd3ec2aef
-  .word 0x2a85c8ed
-  .word 0x8a2ed19d
-  .word 0xc656398d
-  .word 0x5013875a
-  .word 0x0314088f
-  .word 0xfe814112
-  .word 0x181d9c6e
-  .word 0xe3f82d19
-  .word 0x988e056b
-  .word 0xe23ee7e4
-  .word 0xb3312fa7
-  .zero 16
-
-/* P-384 domain parameter p (modulus) */
-.globl p384_p
-p384_p:
-  .word 0xffffffff
-  .word 0x00000000
-  .word 0x00000000
-  .word 0xffffffff
-  .word 0xfffffffe
-  .word 0xffffffff
-  .word 0xffffffff
-  .word 0xffffffff
-  .word 0xffffffff
-  .word 0xffffffff
-  .word 0xffffffff
-  .word 0xffffffff
-  .zero 16
-
-/* barrett constant u for modulus p */
-.globl p384_u_p
-p384_u_p:
-  .word 0x00000001
-  .word 0xffffffff
-  .word 0xffffffff
-  .word 0x00000000
-  .word 0x00000001
-  .word 0x00000000
-  .word 0x00000000
-  .word 0x00000000
-  .word 0x00000000
-  .word 0x00000000
-  .word 0x00000000
-  .word 0x00000000
-  .zero 16
-
-/* P-384 domain parameter n (order of base point) */
-p384_n:
-  .word 0xccc52973
-  .word 0xecec196a
-  .word 0x48b0a77a
-  .word 0x581a0db2
-  .word 0xf4372ddf
-  .word 0xc7634d81
-  .word 0xffffffff
-  .word 0xffffffff
-  .word 0xffffffff
-  .word 0xffffffff
-  .word 0xffffffff
-  .word 0xffffffff
-  .zero 16
-
-/* barrett constant u for n */
-p384_u_n:
-  .word 0x333ad68d
-  .word 0x1313e695
-  .word 0xb74f5885
-  .word 0xa7e5f24d
-  .word 0x0bc8d220
-  .word 0x389cb27e
-  .word 0x00000000
-  .word 0x00000000
-  .word 0x00000000
-  .word 0x00000000
-  .word 0x00000000
-  .word 0x00000000
-  .zero 16
-
-/* P-384 basepoint G affine x-coordinate */
-p384_gx:
-  .word 0x72760ab7
-  .word 0x3a545e38
-  .word 0xbf55296c
-  .word 0x5502f25d
-  .word 0x82542a38
-  .word 0x59f741e0
-  .word 0x8ba79b98
-  .word 0x6e1d3b62
-  .word 0xf320ad74
-  .word 0x8eb1c71e
-  .word 0xbe8b0537
-  .word 0xaa87ca22
-  .zero 16
-
-/* P-384 basepoint G affine y-coordinate */
-p384_gy:
-  .word 0x90ea0e5f
-  .word 0x7a431d7c
-  .word 0x1d7e819d
-  .word 0x0a60b1ce
-  .word 0xb5f0b8c0
-  .word 0xe9da3113
-  .word 0x289a147c
-  .word 0xf8f41dbd
-  .word 0x9292dc29
-  .word 0x5d9e98bf
-  .word 0x96262c6f
-  .word 0x3617de4a
-  .zero 16
-
-/* 704 bytes of scratchpad memory */
-scratchpad:
-  .zero 704
diff --git a/sw/otbn/code-snippets/p384_base.s b/sw/otbn/code-snippets/p384_base.s
new file mode 100644
index 0000000..f096167
--- /dev/null
+++ b/sw/otbn/code-snippets/p384_base.s
@@ -0,0 +1,832 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+/*
+ *   This library contains:
+ *   - P-384 specific routines for point addition in projective space
+ *   - P-384 domain parameters
+ */
+
+ .section .text
+
+/**
+ * 384-bit modular multiplication based on Barrett reduction algorithm
+ * optimized for the special modulus of the NIST P-384 curve.
+ *
+ * Returns c = a x b % p.
+ *
+ * Expects: two operands, modulus p and pre-calculated parameter u for barrett
+ * reduction (usually greek mu in literature). u is expected without the
+ * leading 1 at bit 384. u has to be pre-calculated as u = floor(2^768/p).
+ *
+ * This implementation mostly follows the description in the
+ * "Handbook of Applied Cryptography" in Algorithm 14.42.
+ * Differences:
+ *   - This implementation incorporates a multiplication before the reduction.
+ *     Therefore it expects two operands (a, b) instead of a wider integer x.
+ *   - The computation of q2 ignores the MSbs of q1 and u to allow using
+ *     a 384x384 bit multiplication. This is compensated later by
+ *     individual (conditional) additions.
+ *   - The truncations in step 2 of HAC 14.42 in the form of (... mod b^(k+1) )
+ *     are not implemented here and the full register width is used. This
+ *     allows to omit computation of r1 (since r1=x) and step 3 of HAC 14.42
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * @param[in] [w11, w10]: a, first operand, max. length 384 bit, a < m.
+ * @param[in] [w17, w16]: b, second operand, max. length 384 bit, b < m.
+ * @param[in] [w13, w12]: p, modulus of P384 i.e.:
+                             m = 2^384 - 2^128 - 2^96 + 2^32 - 1.
+ * @param[in] [w15, w14]: u, pre-computed Barrett constant (without u[384]/MSb
+ *                           of u which is always 1 for the allowed range but
+ *                           has to be set to 0 here).
+ * @param[in] w31: all-zero.
+ * @param[out] [w17, w16]: c, result, max. length 384 bit.
+ *
+ * Clobbered registers: w10, w11, w16, w17, w18, w19, w20, w21, w22, w23, w24
+ * Clobbered flag groups: FG0
+ */
+ .globl barrett384_p384
+barrett384_p384:
+  /* Compute the integer product of the operands x = a * b
+     x = [w18, w22, w21] = a * b = [w11, w10] * [w17, w16]
+     => max. length x: 768 bit */
+  bn.mulqacc.z          w10.0, w16.0,   0
+  bn.mulqacc            w10.0, w16.1,  64
+  bn.mulqacc.so w21.L,  w10.1, w16.0,  64
+  bn.mulqacc            w10.0, w16.2,   0
+  bn.mulqacc            w10.1, w16.1,   0
+  bn.mulqacc            w10.2, w16.0,   0
+  bn.mulqacc            w10.0, w16.3,  64
+  bn.mulqacc            w10.1, w16.2,  64
+  bn.mulqacc            w10.2, w16.1,  64
+  bn.mulqacc.so w21.U,  w10.3, w16.0,  64
+  bn.mulqacc            w10.0, w17.0,   0
+  bn.mulqacc            w10.1, w16.3,   0
+  bn.mulqacc            w10.2, w16.2,   0
+  bn.mulqacc            w10.3, w16.1,   0
+  bn.mulqacc            w11.0, w16.0,   0
+  bn.mulqacc            w10.0, w17.1,  64
+  bn.mulqacc            w10.1, w17.0,  64
+  bn.mulqacc            w10.2, w16.3,  64
+  bn.mulqacc            w10.3, w16.2,  64
+  bn.mulqacc            w11.0, w16.1,  64
+  bn.mulqacc.so w22.L,  w11.1, w16.0,  64
+  bn.mulqacc            w10.1, w17.1,   0
+  bn.mulqacc            w10.2, w17.0,   0
+  bn.mulqacc            w10.3, w16.3,   0
+  bn.mulqacc            w11.0, w16.2,   0
+  bn.mulqacc            w11.1, w16.1,   0
+  bn.mulqacc            w10.2, w17.1,  64
+  bn.mulqacc            w10.3, w17.0,  64
+  bn.mulqacc            w11.0, w16.3,  64
+  bn.mulqacc.so w22.U,  w11.1, w16.2,  64
+  bn.mulqacc            w10.3, w17.1,   0
+  bn.mulqacc            w11.0, w17.0,   0
+  bn.mulqacc            w11.1, w16.3,   0
+  bn.mulqacc            w11.0, w17.1,  64
+  bn.mulqacc.so w18.L,  w11.1, w17.0,  64
+  bn.mulqacc.so w18.U,  w11.1, w17.1,   0
+
+  /* Store correction factor to compensate for later neglected MSb of x.
+     x is 768 bit wide and therefore the 383 bit right shifted version q1
+     (below) contains 385 bit. Bit 384 of q1 is neglected to allow using a
+     384x384 multiplier. For the MSb of x being set we temporary store u
+     (or zero) here to be used in a later constant time correction of a
+     multiplication with u. Note that this requires the MSb flag being carried
+     over from the multiplication routine. */
+  bn.sel w23, w14, w31, M
+  bn.sel w24, w15, w31, M
+
+  /* Compute q1 = x >> 383
+     q1 = [w11, w10] = [w18, w22, w21] >> 383  = [w18, w21] >> 127
+     => max length q1: 385 bits */
+  bn.rshi w11, w31, w18 >> 127
+  bn.rshi w10, w18, w22 >> 127
+
+  /* Compute q2 = q1*u
+     Instead of full q2 (which would be up to 770 bits) we ignore the MSb of u
+     and the MSb of q1 and correct this later. This allows using a 384x384
+     multiplier. We use the property that u for the modulus of P384 is zero in
+     the bits 383 downto 129 and use a 384x192 multiplication routine.
+     => max. length q2': 513 bit
+     q2' = q1[383:0]*u[128:0] = [w18, w17, w16] = [w11, w10] * [w15, w14] */
+
+  /* 576 = 384*192 bit multiplication kernel */
+  bn.mulqacc.z          w10.0, w14.0,   0
+  bn.mulqacc            w10.0, w14.1,  64
+  bn.mulqacc.so w16.L,  w10.1, w14.0,  64
+  bn.mulqacc            w10.0, w14.2,   0
+  bn.mulqacc            w10.1, w14.1,   0
+  bn.mulqacc            w10.2, w14.0,   0
+  bn.mulqacc            w10.1, w14.2,  64
+  bn.mulqacc            w10.2, w14.1,  64
+  bn.mulqacc.so w16.U,  w10.3, w14.0,  64
+  bn.mulqacc            w10.2, w14.2,   0
+  bn.mulqacc            w10.3, w14.1,   0
+  bn.mulqacc            w11.0, w14.0,   0
+  bn.mulqacc            w10.3, w14.2,  64
+  bn.mulqacc            w11.0, w14.1,  64
+  bn.mulqacc.so w17.L,  w11.1, w14.0,  64
+  bn.mulqacc            w11.0, w14.2,   0
+  bn.mulqacc            w11.1, w14.1,   0
+  bn.mulqacc.so w17.U,  w11.1, w14.2,  64
+
+  /* w14.3 is always zero here due to structure of Barrett constant */
+  bn.mulqacc.wo w18,    w11.1, w14.3,  64
+
+  /* q3 = q2 >> 385
+     In this step, the compensation for the neglected MSbs of q1 and u is
+     carried out along the way. To add them in the q2 domain, they would have
+     to be left shifted by 384 bit first. To directly add them we first shift
+     q2' by 384 bit to the right, perform the additions, and shift the result
+     another bit to the right. The additions cannot overflow due to leading
+     zeros after shift.
+     q2'' = q2' >> 384 = [w20, w19] = [w18, w17, w16] >> 384
+                                    = [w18, w17] >> 128 */
+  bn.rshi w20, w31, w18 >> 128
+  bn.rshi w19, w18, w17 >> 128
+  /* Add q1. This is unconditional since MSb of u is always 1.
+     This cannot overflow due to leading zeros.
+     q2''' = q2'' + q1 = [w20, w19] = [w20, w19] + [w10, w11] */
+  bn.add w19, w19, w10
+  bn.addc w20, w20, w11
+  /* Conditionally add u (without leading 1) in case of MSb of x being set.
+     This is the "real" q2 but shifted by 384 bits to the right. This cannot
+     overflow due to leading zeros.
+     q2'''' = x[767]?q2'''+u[383:0]:q2'''
+            = [w20, w19] + [w24, w23] = q2 >> 384 */
+  bn.add w19, w19, w23
+  bn.addc w20, w20, w24
+  /* finally this gives q3 by shifting the remaining bit to the right
+     q3 = q2 >> 385 = q2'''' >> 1 = [w11, w10] = [w20, w19] >> 1 */
+  bn.rshi w11, w31, w20 >> 1
+  bn.rshi w10, w20, w19 >> 1
+
+  /* r2 = q3*m[511:0] = [w17, w16] = ([w11, w10] * [w13, w12])[511:0]
+     A 384x384 bit multiplication kernel is used here, hence both q3 and p
+     must not be wider than 384 bit. This is always the case for p. For q3 it
+     is the case if a<p and b<p.
+     The 256 highest bits of the multiplication result are not needed,
+     so we do not compute them. */
+  bn.mulqacc.z          w10.0, w12.0,   0
+  bn.mulqacc            w10.0, w12.1,  64
+  bn.mulqacc.so w16.L,  w10.1, w12.0,  64
+  bn.mulqacc            w10.0, w12.2,   0
+  bn.mulqacc            w10.1, w12.1,   0
+  bn.mulqacc            w10.2, w12.0,   0
+  bn.mulqacc            w10.0, w12.3,  64
+  bn.mulqacc            w10.1, w12.2,  64
+  bn.mulqacc            w10.2, w12.1,  64
+  bn.mulqacc.so w16.U,  w10.3, w12.0,  64
+  bn.mulqacc            w10.0, w13.0,   0
+  bn.mulqacc            w10.1, w12.3,   0
+  bn.mulqacc            w10.2, w12.2,   0
+  bn.mulqacc            w10.3, w12.1,   0
+  bn.mulqacc            w11.0, w12.0,   0
+  bn.mulqacc            w10.0, w13.1,  64
+  bn.mulqacc            w10.1, w13.0,  64
+  bn.mulqacc            w10.2, w12.3,  64
+  bn.mulqacc            w10.3, w12.2,  64
+  bn.mulqacc            w11.0, w12.1,  64
+  bn.mulqacc.so w17.L,  w11.1, w12.0,  64
+  bn.mulqacc            w10.1, w13.1,   0
+  bn.mulqacc            w10.2, w13.0,   0
+  bn.mulqacc            w10.3, w12.3,   0
+  bn.mulqacc            w11.0, w12.2,   0
+  bn.mulqacc            w11.1, w12.1,   0
+  bn.mulqacc            w10.2, w13.1,  64
+  bn.mulqacc            w10.3, w13.0,  64
+  bn.mulqacc            w11.0, w12.3,  64
+  bn.mulqacc.so w17.U,  w11.1, w12.2,  64
+
+  /* Compute r = x-r2 = x-q3*p
+     since 0 <= r < 3*p, we only need to consider the lower limbs of x and r2
+     r[511:0] = [w22, w21] - [w17, w16] */
+  bn.sub w21, w21, w16
+  bn.subb w22, w22, w17
+
+  /* Barrett algorithm requires subtraction of the modulus at most two times if
+     result is too large. However in the special case of P-384 we need to
+     subtract only once */
+  bn.sub w16, w21, w12
+  bn.subb w17, w22, w13
+  bn.sel w16, w21, w16, C
+  bn.sel w17, w22, w17, C
+
+  /* return result: c =[w17, w16] =  a * b % p. */
+  ret
+
+
+/**
+ * P-384 point addition in projective space
+ *
+ * returns R = (x_r, y_r, z_r) <= P+Q = (x_p, y_p, z_p) + (x_q, y_q, z_q)
+ *         with R, P and Q being valid P-384 curve points
+ *           in projective coordinates
+ *
+ * This routine adds two valid P-384 curve points in projective space.
+ * Point addition is performed based on the complete formulas of Bosma and
+ * Lenstra for Weierstrass curves as first published in [1] and
+ * optimized in [2].
+ * The implemented version follows Algorithm 4 of [2] which is an optimized
+ * variant for Weierstrass curves with domain parameter 'a' set to a=-3.
+ * Numbering of the steps below and naming of symbols follows the
+ * terminology of Algorithm 4 of [2].
+ * The routine is limited to P-384 curve points due to:
+ *   - fixed a=-3 domain parameter
+ *   - usage of a P-384 optimized Barrett multiplication kernel
+ * This routine runs in constant time.
+ *
+ * [1] https://doi.org/10.1006/jnth.1995.1088
+ * [2] https://doi.org/10.1007/978-3-662-49890-3_16
+ *
+ * @param[in]  x22: set to 10, pointer to in reg for Barrett routine
+ * @param[in]  x23: set to 11, pointer to in reg for Barrett routine
+ * @param[in]  x24: set to 16, pointer to in/out reg for Barrett routine
+ * @param[in]  x25: set to 17, pointer to in/out reg for Barrett routine
+ * @param[in]  x26: dptr_p_p, dmem pointer to point P in dmem (projective)
+ * @param[in]  x27: dptr_q_p, dmem pointer to point Q in dmem (projective)
+ * @param[in]  x28: dptr_b, dmem pointer to domain parameter b of P-384 in dmem
+ * @param[in]  [w13, w12]: p, modulus of underlying field of P-384
+ * @param[in]  [w15, w14]: u[383:0] lower 384 bit of Barrett constant u for
+ *                                    modulus p
+ * @param[in]  w31: all-zero.
+ * @param[out]  [w26, w25]: x_r, x-coordinate of resulting point R
+ * @param[out]  [w28, w27]: y_r, y-coordinate of resulting point R
+ * @param[out]  [w30, w29]: z_r, z-coordinate of resulting point R
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * clobbered registers: w0 to w30
+ * clobbered flag groups: FG0
+ */
+.globl proj_add_p384
+proj_add_p384:
+  /* mapping of parameters to symbols of [2] (Algorithm 4):
+     X1 = x_p; Y1 = y_p; Z1 = z_p; X2 = x_q; Y2 = y_q; Z2 = z_q
+     X3 = x_r; Y3 = y_r; Z3 = z_r */
+
+  /* 1: [w1, w0] = t0 <= X1*X2 = dmem[x26+0]*dmem[x27+0] */
+  bn.lid    x22, 0(x26)
+  bn.lid    x23, 32(x26)
+  bn.lid    x24, 0(x27)
+  bn.lid    x25, 32(x27)
+  jal       x1, barrett384_p384
+  bn.mov    w0, w16
+  bn.mov    w1, w17
+
+  /* 2: [w3, w2] = t1 <= Y1*Y2 = dmem[x26+64]*dmem[x27+64] */
+  bn.lid    x22, 64(x26)
+  bn.lid    x23, 96(x26)
+  bn.lid    x24, 64(x27)
+  bn.lid    x25, 96(x27)
+  jal       x1, barrett384_p384
+  bn.mov    w2, w16
+  bn.mov    w3, w17
+
+  /* 3: [w5, w4] = t2 <= Z1*Z2 = dmem[x26+128]*dmem[x27+128] */
+  bn.lid    x22, 128(x26)
+  bn.lid    x23, 160(x26)
+  bn.lid    x24, 128(x27)
+  bn.lid    x25, 160(x27)
+  jal       x1, barrett384_p384
+  bn.mov    w4, w16
+  bn.mov    w5, w17
+
+  /* 4: [w7, w6] = t3 <= X1+Y1 = dmem[x26+0]+dmem[x26+64] */
+  bn.lid    x22, 0(x26)
+  bn.lid    x23, 32(x26)
+  bn.lid    x24, 64(x26)
+  bn.lid    x25, 96(x26)
+  bn.add    w16, w10, w16
+  bn.addc   w17, w11, w17
+  bn.sub    w10, w16, w12
+  bn.subb   w11, w17, w13
+  bn.sel    w16, w16, w10, C
+  bn.sel    w17, w17, w11, C
+  bn.mov    w6, w16
+  bn.mov    w7, w17
+
+  /* 5: [w9, w8] = t4 <= X2+Y2 = dmem[x27+0]+dmem[x27+64] */
+  bn.lid    x22, 0(x27)
+  bn.lid    x23, 32(x27)
+  bn.lid    x24, 64(x27)
+  bn.lid    x25, 96(x27)
+  bn.add    w16, w10, w16
+  bn.addc   w17, w11, w17
+  bn.sub    w10, w16, w12
+  bn.subb   w11, w17, w13
+  bn.sel    w16, w16, w10, C
+  bn.sel    w17, w17, w11, C
+  bn.mov    w8, w16
+  bn.mov    w9, w17
+
+  /* 6: [w7, w6] = t3 <= t3*t4 = [w7, w6]*[w9, w8] */
+  bn.mov    w10, w6
+  bn.mov    w11, w7
+  bn.mov    w16, w8
+  bn.mov    w17, w9
+  jal       x1, barrett384_p384
+  bn.mov    w6, w16
+  bn.mov    w7, w17
+
+  /* 7: [w9, w8] = t4 <= t0+t1 = [w1, w0]+[w3, w2] */
+  bn.add    w16, w0, w2
+  bn.addc   w17, w1, w3
+  bn.sub    w10, w16, w12
+  bn.subb   w11, w17, w13
+  bn.sel    w16, w16, w10, C
+  bn.sel    w17, w17, w11, C
+  bn.mov    w8, w16
+  bn.mov    w9, w17
+
+  /* 8: [w7, w6] = t3 <= t3-t4 = [w7, w6]-[w9, w8] */
+  bn.sub    w16, w6, w8
+  bn.subb   w17, w7, w9
+  bn.add    w10, w16, w12
+  bn.addc   w11, w17, w13
+  bn.sel    w16, w10, w16, C
+  bn.sel    w17, w11, w17, C
+  bn.mov    w6, w16
+  bn.mov    w7, w17
+
+  /* 9: [w9, w8] = t4 <= Y1+Z1 = dmem[x26+64]+dmem[x26+128] */
+  bn.lid    x22, 64(x26)
+  bn.lid    x23, 96(x26)
+  bn.lid    x24, 128(x26)
+  bn.lid    x25, 160(x26)
+  bn.add    w16, w10, w16
+  bn.addc   w17, w11, w17
+  bn.sub    w10, w16, w12
+  bn.subb   w11, w17, w13
+  bn.sel    w16, w16, w10, C
+  bn.sel    w17, w17, w11, C
+  bn.mov    w8, w16
+  bn.mov    w9, w17
+
+  /* 10: [w26, w25] = X3 <= Y2+Z2 = dmem[x27+64]+dmem[x27+128] */
+  bn.lid    x22, 64(x27)
+  bn.lid    x23, 96(x27)
+  bn.lid    x24, 128(x27)
+  bn.lid    x25, 160(x27)
+  bn.add    w16, w10, w16
+  bn.addc   w17, w11, w17
+  bn.sub    w10, w16, w12
+  bn.subb   w11, w17, w13
+  bn.sel    w16, w16, w10, C
+  bn.sel    w17, w17, w11, C
+  bn.mov    w25, w16
+  bn.mov    w26, w17
+
+  /* 11: [w9, w8] = t4 <= t4*X3 = [w9, w8]*[w26, w25] */
+  bn.mov    w10, w8
+  bn.mov    w11, w9
+  bn.mov    w16, w25
+  bn.mov    w17, w26
+  jal       x1, barrett384_p384
+  bn.mov    w8, w16
+  bn.mov    w9, w17
+
+  /* 12: [w26, w25] = X3 <= t1+t2 = [w3, w2]+[w5, w4] */
+  bn.add    w16, w2, w4
+  bn.addc   w17, w3, w5
+  bn.sub    w10, w16, w12
+  bn.subb   w11, w17, w13
+  bn.sel    w16, w16, w10, C
+  bn.sel    w17, w17, w11, C
+  bn.mov    w25, w16
+  bn.mov    w26, w17
+
+  /* 13: [w9, w8] = t4 <= t4-X3 = [w9, w8]-[w26, w25] */
+  bn.sub    w16, w8, w25
+  bn.subb   w17, w9, w26
+  bn.add    w10, w16, w12
+  bn.addc   w11, w17, w13
+  bn.sel    w16, w10, w16, C
+  bn.sel    w17, w11, w17, C
+  bn.mov    w8, w16
+  bn.mov    w9, w17
+
+  /* 14: [w26, w25] = X3 <= X1+Z1 = dmem[x26+0]+dmem[x26+128] */
+  bn.lid    x22, 0(x26)
+  bn.lid    x23, 32(x26)
+  bn.lid    x24, 128(x26)
+  bn.lid    x25, 160(x26)
+  bn.add    w16, w10, w16
+  bn.addc   w17, w11, w17
+  bn.sub    w10, w16, w12
+  bn.subb   w11, w17, w13
+  bn.sel    w16, w16, w10, C
+  bn.sel    w17, w17, w11, C
+  bn.mov    w25, w16
+  bn.mov    w26, w17
+
+  /* 15: [w28, w27] = Y3 <= X2+Z2 = dmem[x27+0]+dmem[x27+128] */
+  bn.lid    x22, 0(x27)
+  bn.lid    x23, 32(x27)
+  bn.lid    x24, 128(x27)
+  bn.lid    x25, 160(x27)
+  bn.add    w16, w10, w16
+  bn.addc   w17, w11, w17
+  bn.sub    w10, w16, w12
+  bn.subb   w11, w17, w13
+  bn.sel    w16, w16, w10, C
+  bn.sel    w17, w17, w11, C
+  bn.mov    w27, w16
+  bn.mov    w28, w17
+
+  /* 16: [w26, w25] = X3 <= X3*Y3 = [w26, w25]*[w28, w27] */
+  bn.mov    w10, w25
+  bn.mov    w11, w26
+  bn.mov    w16, w27
+  bn.mov    w17, w28
+  jal       x1, barrett384_p384
+  bn.mov    w25, w16
+  bn.mov    w26, w17
+
+  /* 17: [w28, w27] = Y3 <= t0+t2 = [w1, w0]+[w5, w4] */
+  bn.add    w16, w0, w4
+  bn.addc   w17, w1, w5
+  bn.sub    w10, w16, w12
+  bn.subb   w11, w17, w13
+  bn.sel    w16, w16, w10, C
+  bn.sel    w17, w17, w11, C
+  bn.mov    w27, w16
+  bn.mov    w28, w17
+
+  /* 18: [w28, w27] = Y3 <= X3-Y3 = [w26, w25]-[w28, w27] */
+  bn.sub    w16, w25, w27
+  bn.subb   w17, w26, w28
+  bn.add    w10, w16, w12
+  bn.addc   w11, w17, w13
+  bn.sel    w16, w10, w16, C
+  bn.sel    w17, w11, w17, C
+  bn.mov    w27, w16
+  bn.mov    w28, w17
+
+  /* 19: [w30, w29] = Z3 <= b*t2 = dmem[x28+0]*[w5, w4] */
+  bn.lid    x22, 0(x28)
+  bn.lid    x23, 32(x28)
+  bn.mov    w16, w4
+  bn.mov    w17, w5
+  jal       x1, barrett384_p384
+  bn.mov    w29, w16
+  bn.mov    w30, w17
+
+  /* 20: [w26, w25] = X3 <= Y3-Z3 = [w28, w27]-[w30, w29] */
+  bn.sub    w16, w27, w29
+  bn.subb   w17, w28, w30
+  bn.add    w10, w16, w12
+  bn.addc   w11, w17, w13
+  bn.sel    w16, w10, w16, C
+  bn.sel    w17, w11, w17, C
+  bn.mov    w25, w16
+  bn.mov    w26, w17
+
+  /* 21: [w30, w29] = Z3 <= X3+X3 = [w26, w25]+[w26, w25] */
+  bn.add    w16, w25, w25
+  bn.addc   w17, w26, w26
+  bn.sub    w10, w16, w12
+  bn.subb   w11, w17, w13
+  bn.sel    w16, w16, w10, C
+  bn.sel    w17, w17, w11, C
+  bn.mov    w29, w16
+  bn.mov    w30, w17
+
+  /* 22: [w26, w25] = X3 <= X3+Z3 = [w26, w25]+[w30, w29] */
+  bn.add    w16, w25, w29
+  bn.addc   w17, w26, w30
+  bn.sub    w10, w16, w12
+  bn.subb   w11, w17, w13
+  bn.sel    w16, w16, w10, C
+  bn.sel    w17, w17, w11, C
+  bn.mov    w25, w16
+  bn.mov    w26, w17
+
+  /* 23: [w30, w29] = Z3 <= t1-X3 = [w3, w2]-[w26, w25] */
+  bn.sub    w16, w2, w25
+  bn.subb   w17, w3, w26
+  bn.add    w10, w16, w12
+  bn.addc   w11, w17, w13
+  bn.sel    w16, w10, w16, C
+  bn.sel    w17, w11, w17, C
+  bn.mov    w29, w16
+  bn.mov    w30, w17
+
+  /* 24: [w26, w25] = X3 <= t1+X3 = [w3, w2]+[w26, w25] */
+  bn.add    w16, w2, w25
+  bn.addc   w17, w3, w26
+  bn.sub    w10, w16, w12
+  bn.subb   w11, w17, w13
+  bn.sel    w16, w16, w10, C
+  bn.sel    w17, w17, w11, C
+  bn.mov    w25, w16
+  bn.mov    w26, w17
+
+  /* 25: [w28, w27] = Y3 <= b*Y3 = dmem[x28+0]*[w28, w27] */
+  bn.lid    x22, 0(x28)
+  bn.lid    x23, 32(x28)
+  bn.mov    w16, w27
+  bn.mov    w17, w28
+  jal       x1, barrett384_p384
+  bn.mov    w27, w16
+  bn.mov    w28, w17
+
+  /* 26: [w3, w2] = t1 <= t2+t2 = [w5, w4]+[w5, w4] */
+  bn.add    w16, w4, w4
+  bn.addc   w17, w5, w5
+  bn.sub    w10, w16, w12
+  bn.subb   w11, w17, w13
+  bn.sel    w16, w16, w10, C
+  bn.sel    w17, w17, w11, C
+  bn.mov    w2, w16
+  bn.mov    w3, w17
+
+  /* 27: [w5, w4] = t2 <= t1+t2 = [w3, w2]+[w5, w4] */
+  bn.add    w16, w2, w4
+  bn.addc   w17, w3, w5
+  bn.sub    w10, w16, w12
+  bn.subb   w11, w17, w13
+  bn.sel    w16, w16, w10, C
+  bn.sel    w17, w17, w11, C
+  bn.mov    w4, w16
+  bn.mov    w5, w17
+
+  /* 28: [w28, w27] = Y3 <= Y3-t2 = [w28, w27]-[w5, w4] */
+  bn.sub    w16, w27, w4
+  bn.subb   w17, w28, w5
+  bn.add    w10, w16, w12
+  bn.addc   w11, w17, w13
+  bn.sel    w16, w10, w16, C
+  bn.sel    w17, w11, w17, C
+  bn.mov    w27, w16
+  bn.mov    w28, w17
+
+  /* 29: [w28, w27] = Y3 <= Y3-t0 = [w28, w27]-[w1, w0] */
+  bn.sub    w16, w27, w0
+  bn.subb   w17, w28, w1
+  bn.add    w10, w16, w12
+  bn.addc   w11, w17, w13
+  bn.sel    w16, w10, w16, C
+  bn.sel    w17, w11, w17, C
+  bn.mov    w27, w16
+  bn.mov    w28, w17
+
+  /* 30: [w3, w2] = t1 <= Y3+Y3 = [w28, w27]+[w28, w27] */
+  bn.add    w16, w27, w27
+  bn.addc   w17, w28, w28
+  bn.sub    w10, w16, w12
+  bn.subb   w11, w17, w13
+  bn.sel    w16, w16, w10, C
+  bn.sel    w17, w17, w11, C
+  bn.mov    w2, w16
+  bn.mov    w3, w17
+
+  /* 31: [w28, w27] = Y3 <= t1+Y3 = [w3, w2]+[w28, w27] */
+  bn.add    w16, w2, w27
+  bn.addc   w17, w3, w28
+  bn.sub    w10, w16, w12
+  bn.subb   w11, w17, w13
+  bn.sel    w16, w16, w10, C
+  bn.sel    w17, w17, w11, C
+  bn.mov    w27, w16
+  bn.mov    w28, w17
+
+  /* 32: [w3, w2] = t1 <= t0+t0 = [w1, w0]+[w1, w0] */
+  bn.add    w16, w0, w0
+  bn.addc   w17, w1, w1
+  bn.sub    w10, w16, w12
+  bn.subb   w11, w17, w13
+  bn.sel    w16, w16, w10, C
+  bn.sel    w17, w17, w11, C
+  bn.mov    w2, w16
+  bn.mov    w3, w17
+
+  /* 33: [w1, w0] = t0 <= t1+t0 = [w3, w2]+[w1, w0] */
+  bn.add    w16, w2, w0
+  bn.addc   w17, w3, w1
+  bn.sub    w10, w16, w12
+  bn.subb   w11, w17, w13
+  bn.sel    w16, w16, w10, C
+  bn.sel    w17, w17, w11, C
+  bn.mov    w0, w16
+  bn.mov    w1, w17
+
+  /* 34: [w1, w0] = t0 <= t0-t2 = [w1, w0]-[w5, w4] */
+  bn.sub    w16, w0, w4
+  bn.subb   w17, w1, w5
+  bn.add    w10, w16, w12
+  bn.addc   w11, w17, w13
+  bn.sel    w16, w10, w16, C
+  bn.sel    w17, w11, w17, C
+  bn.mov    w0, w16
+  bn.mov    w1, w17
+
+  /* 35: [w3, w2] = t1 <= t4*Y3 = [w9, w8]*[w28, w27] */
+  bn.mov    w10, w8
+  bn.mov    w11, w9
+  bn.mov    w16, w27
+  bn.mov    w17, w28
+  jal       x1, barrett384_p384
+  bn.mov    w2, w16
+  bn.mov    w3, w17
+
+  /* 36: [w5, w4] = t2 <= t0*Y3 = [w1, w0]*[w28, w27] */
+  bn.mov    w10, w0
+  bn.mov    w11, w1
+  bn.mov    w16, w27
+  bn.mov    w17, w28
+  jal       x1, barrett384_p384
+  bn.mov    w4, w16
+  bn.mov    w5, w17
+
+  /* 37: [w28, w27] = Y3 <= X3*Z3 = [w26, w25]*[w30, w29] */
+  bn.mov    w10, w25
+  bn.mov    w11, w26
+  bn.mov    w16, w29
+  bn.mov    w17, w30
+  jal       x1, barrett384_p384
+  bn.mov    w27, w16
+  bn.mov    w28, w17
+
+  /* 38: [w28, w27] = Y3 <= Y3+t2 = [w28, w27]+[w5, w4] */
+  bn.add    w16, w27, w4
+  bn.addc   w17, w28, w5
+  bn.sub    w10, w16, w12
+  bn.subb   w11, w17, w13
+  bn.sel    w16, w16, w10, C
+  bn.sel    w17, w17, w11, C
+  bn.mov    w27, w16
+  bn.mov    w28, w17
+
+  /* 39: [w26, w25] = X3 <= t3*X3 = [w7, w6]*[w26, w25] */
+  bn.mov    w10, w6
+  bn.mov    w11, w7
+  bn.mov    w16, w25
+  bn.mov    w17, w26
+  jal       x1, barrett384_p384
+  bn.mov    w25, w16
+  bn.mov    w26, w17
+
+  /* 40: [w26, w25] = X3 <= X3-t1 = [w26, w25]-[w3, w2] */
+  bn.sub    w16, w25, w2
+  bn.subb   w17, w26, w3
+  bn.add    w10, w16, w12
+  bn.addc   w11, w17, w13
+  bn.sel    w16, w10, w16, C
+  bn.sel    w17, w11, w17, C
+  bn.mov    w25, w16
+  bn.mov    w26, w17
+
+  /* 41: [w30, w29] = Z3 <= t4*Z3 = [w9, w8]*[w30, w29] */
+  bn.mov    w10, w8
+  bn.mov    w11, w9
+  bn.mov    w16, w29
+  bn.mov    w17, w30
+  jal       x1, barrett384_p384
+  bn.mov    w29, w16
+  bn.mov    w30, w17
+
+  /* 42: [w3, w2] = t1 <= t3*t0 = [w7, w6]*[w1, w0] */
+  bn.mov    w10, w6
+  bn.mov    w11, w7
+  bn.mov    w16, w0
+  bn.mov    w17, w1
+  jal       x1, barrett384_p384
+  bn.mov    w2, w16
+  bn.mov    w3, w17
+
+  /* 43: [w30, w29] = Z3 <= Z3+t1 = [w30, w29]+[w3, w2] */
+  bn.add    w16, w29, w2
+  bn.addc   w17, w30, w3
+  bn.sub    w10, w16, w12
+  bn.subb   w11, w17, w13
+  bn.sel    w16, w16, w10, C
+  bn.sel    w17, w17, w11, C
+  bn.mov    w29, w16
+  bn.mov    w30, w17
+
+  ret
+
+
+.section .data
+
+/* P-384 domain parameter b */
+.globl p384_b
+p384_b:
+  .word 0xd3ec2aef
+  .word 0x2a85c8ed
+  .word 0x8a2ed19d
+  .word 0xc656398d
+  .word 0x5013875a
+  .word 0x0314088f
+  .word 0xfe814112
+  .word 0x181d9c6e
+  .word 0xe3f82d19
+  .word 0x988e056b
+  .word 0xe23ee7e4
+  .word 0xb3312fa7
+  .zero 16
+
+/* P-384 domain parameter p (modulus) */
+.globl p384_p
+p384_p:
+  .word 0xffffffff
+  .word 0x00000000
+  .word 0x00000000
+  .word 0xffffffff
+  .word 0xfffffffe
+  .word 0xffffffff
+  .word 0xffffffff
+  .word 0xffffffff
+  .word 0xffffffff
+  .word 0xffffffff
+  .word 0xffffffff
+  .word 0xffffffff
+  .zero 16
+
+/* barrett constant u for modulus p */
+.globl p384_u_p
+p384_u_p:
+  .word 0x00000001
+  .word 0xffffffff
+  .word 0xffffffff
+  .word 0x00000000
+  .word 0x00000001
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .zero 16
+
+/* P-384 domain parameter n (order of base point) */
+.globl p384_n
+p384_n:
+  .word 0xccc52973
+  .word 0xecec196a
+  .word 0x48b0a77a
+  .word 0x581a0db2
+  .word 0xf4372ddf
+  .word 0xc7634d81
+  .word 0xffffffff
+  .word 0xffffffff
+  .word 0xffffffff
+  .word 0xffffffff
+  .word 0xffffffff
+  .word 0xffffffff
+  .zero 16
+
+/* barrett constant u for n */
+.globl p384_u_n
+p384_u_n:
+  .word 0x333ad68d
+  .word 0x1313e695
+  .word 0xb74f5885
+  .word 0xa7e5f24d
+  .word 0x0bc8d220
+  .word 0x389cb27e
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .zero 16
+
+/* P-384 basepoint G affine x-coordinate */
+.globl p384_gx
+p384_gx:
+  .word 0x72760ab7
+  .word 0x3a545e38
+  .word 0xbf55296c
+  .word 0x5502f25d
+  .word 0x82542a38
+  .word 0x59f741e0
+  .word 0x8ba79b98
+  .word 0x6e1d3b62
+  .word 0xf320ad74
+  .word 0x8eb1c71e
+  .word 0xbe8b0537
+  .word 0xaa87ca22
+  .zero 16
+
+/* P-384 basepoint G affine y-coordinate */
+.globl p384_gy
+p384_gy:
+  .word 0x90ea0e5f
+  .word 0x7a431d7c
+  .word 0x1d7e819d
+  .word 0x0a60b1ce
+  .word 0xb5f0b8c0
+  .word 0xe9da3113
+  .word 0x289a147c
+  .word 0xf8f41dbd
+  .word 0x9292dc29
+  .word 0x5d9e98bf
+  .word 0x96262c6f
+  .word 0x3617de4a
+  .zero 16
diff --git a/sw/otbn/code-snippets/p384_sign.s b/sw/otbn/code-snippets/p384_sign.s
new file mode 100644
index 0000000..e72f2d5
--- /dev/null
+++ b/sw/otbn/code-snippets/p384_sign.s
@@ -0,0 +1,1011 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+/*
+ *   P-384 specific routines for ECDSA signature generation and constant-time
+ *   scalar multiplication.
+ */
+
+ .section .text
+
+/**
+ * Convert projective coordinates of a P-384 curve point to affine coordinates
+ *
+ * returns P = (x_a, y_a) = (x/z mod p, y/z mod p)
+ *              where P is a valid P-384 curve point,
+ *                    x_a and y_a are the resulting affine coordinates of the
+ *                      curve point,
+ *                    x,y and z are a set of projective coordinates of the
+ *                      point and
+ *                    p is the modulus of the P-384 underlying finite field.
+ *
+ * This routine computes the affine coordinates for a set of projective
+ * coordinates of a valid P-384 curve point. The routine performs the required
+ * divisions by computing the multiplicative modular inverse of the
+ * projective z-coordinate in the underlying finite field of the P-384 curve.
+ * For inverse computation Fermat's little theorem is used, i.e.
+ * we compute z^-1 = z^(p-2) mod p.
+ * For exponentiation a 16 step addition chain is used.
+ * Source of the addition chain is the addchain project:
+ * https://github.com/mmcloughlin/addchain/
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * @param[in]  [w26,w25]: x, x-coordinate of curve point (projective).
+ * @param[in]  [w26,w25]: y, y-coordinate of curve point (projective).
+ * @param[in]  [w30,w29]: z, z-coordinate of curve point (projective).
+ * @param[in]  [w13, w12]: p, modulus of P-384.
+ * @param[in]  [w15, w14]: u, pre-computed Barrett constant for p,
+ *                            lower 384 bits, i.e. (2^(2*384) div p)[383:0].
+ * @param[in] w31: all-zero.
+ * @param[out] [w26, w25]: x_a, affine x-coordinate of resulting point.
+ * @param[out] [w28, w27]: y_a, affine y-coordinate of resulting point.
+ *
+ * clobbered registers: w0 to w28
+ * clobbered flag groups: FG0
+ */
+proj_to_affine_p384:
+
+  /* Exp: 0b10 = 2*0b1
+     Val: r10 = z^2 mod p
+          [w17,w16] <= [w30,w29]^2 mod [w13,w12] */
+  bn.mov    w10, w29
+  bn.mov    w11, w30
+  bn.mov    w16, w29
+  bn.mov    w17, w30
+  jal       x1, barrett384_p384
+
+  /* Exp: 0b11 = 0b1+0b10
+     Val: r11 <= z*r10 mod p
+          [w17,w16] <= [w30,w29]*[w17,w16] mod [w13,w12] */
+  bn.mov    w10, w29
+  bn.mov    w11, w30
+  jal       x1, barrett384_p384
+
+  /* Exp: 0b110 = 2*0b11
+     Val: r110 = r11^2 mod p
+          [w17,w16] <= [w17,w16]^2 mod [w13,w12] */
+  bn.mov    w10, w16
+  bn.mov    w11, w17
+  jal       x1, barrett384_p384
+
+  /* Exp: 0b111 = 0b1+0b110
+     Val: r111 <= z*r110  mod p
+          [w1,w0] = [w17,w16] <= [w30,w29]*[w17,w16] mod [w13,w12] */
+  bn.mov    w10, w29
+  bn.mov    w11, w30
+  jal       x1, barrett384_p384
+  bn.mov    w0, w16
+  bn.mov    w1, w17
+
+  /* Exp: 0b111000 = 0b111<<3
+     Val: r111000 <= r111^(2^3)  mod p
+          [w17,w16] <= [w17,w16]^(2^3) mod [w13,w12] */
+  loopi     3, 4
+    bn.mov    w10, w16
+    bn.mov    w11, w17
+    jal       x1, barrett384_p384
+    nop
+
+  /* Exp: 0b1111111 = 0b111+0b111000
+     Val: r1111111 <= r111*r111000 mod p
+          [w3,w2] = [w17,w16] <= [w1,w0]*[w17,w16] mod [w13,w12] */
+  bn.mov    w10, w0
+  bn.mov    w11, w1
+  jal       x1, barrett384_p384
+  bn.mov    w2, w16
+  bn.mov    w3, w17
+
+  /* Exp: 2^12-1 = (0b1111111<<6)+0b111111
+     Val: r_12_1 <= r111111^(2^6)*r111111 mod p
+          [w5,w4] = [w17,w16] <= [w17,w16]^(2^6)*[w17,w16] mod [w13,w12] */
+  loopi     6, 4
+    bn.mov    w10, w16
+    bn.mov    w11, w17
+    jal       x1, barrett384_p384
+    nop
+  bn.mov    w10, w2
+  bn.mov    w11, w3
+  jal       x1, barrett384_p384
+  bn.mov    w4, w16
+  bn.mov    w5, w17
+
+  /* Exp: 2^24-1 = ((2^12-1)<<12)+(2^12-1)
+     Val: r_24_1 <= r_12_1^(2^12)*r12_1 mod p
+          [w17,w16] <= [w17,w16]^(2^12)*[w5,w4] mod [w13,w12] */
+  loopi     12, 4
+    bn.mov    w10, w16
+    bn.mov    w11, w17
+    jal       x1, barrett384_p384
+    nop
+  bn.mov    w10, w4
+  bn.mov    w11, w5
+  jal       x1, barrett384_p384
+
+  /* Exp: 2^30-1 = ((2^24-1)<<6)+0b111111
+     Val: r_30_1 <= r_24_1^(2^6)*r111111 mod p
+          [w3, w2] = [w17,w16] <= [w17,w16]^(2^6)*[w3,w2] mod [w13,w12] */
+  loopi     6, 4
+    bn.mov    w10, w16
+    bn.mov    w11, w17
+    jal       x1, barrett384_p384
+    nop
+  bn.mov    w10, w2
+  bn.mov    w11, w3
+  jal       x1, barrett384_p384
+  bn.mov    w2, w16
+  bn.mov    w3, w17
+
+  /* Exp: 2^31-1 <= (2^30-1)*2+0b1
+     Val: r_31_1 <= r30_1^2*z mod p
+          [w7,w6] = [w17,w16] <= [w17,w16]^2*[w30,w29] mod [w13,w12] */
+  bn.mov    w10, w16
+  bn.mov    w11, w17
+  jal       x1, barrett384_p384
+  bn.mov    w10, w29
+  bn.mov    w11, w30
+  jal       x1, barrett384_p384
+  bn.mov    w6, w16
+  bn.mov    w7, w17
+
+  /* Exp: 2^32-1 <= (2^30-1)*2+0b1
+     Val: r_32_1 <= r31_1^2*z mod p
+          [w9,w8] = [w17,w16] <= [w17,w16]^2*[w30,w29] mod [w13,w12] */
+  bn.mov    w10, w16
+  bn.mov    w11, w17
+  jal       x1, barrett384_p384
+  bn.mov    w10, w29
+  bn.mov    w11, w30
+  jal       x1, barrett384_p384
+  bn.mov    w9, w16
+  bn.mov    w8, w17
+
+  /* Exp: 2^63-1 <= ((2^32-1)<<31)+(2^31-1)
+     Val: r_63_1 <= r_32_1^(2^31)*r_31_1 mod p
+          [w7,w6] = [w17,w16] <= [w17,w16]^(2^31)*[w7,w6] mod [w13,w12] */
+  loopi     31, 4
+    bn.mov    w10, w16
+    bn.mov    w11, w17
+    jal       x1, barrett384_p384
+    nop
+  bn.mov    w10, w6
+  bn.mov    w11, w7
+  jal       x1, barrett384_p384
+  bn.mov    w6, w16
+  bn.mov    w7,w17
+
+  /* Exp: 2^126-1 = ((2^63-1)<<63) + (2^63-1)
+     Val: r_126_1 <= r_63_1^(2^63)*r_63_1 mod p
+          [w7,w6] = [w17,w16] <= [w17,w16]^(2^63)*[w7,w6] mod [w13,w12] */
+  loopi     63, 4
+    bn.mov    w10, w16
+    bn.mov    w11, w17
+    jal       x1, barrett384_p384
+    nop
+  bn.mov    w10, w6
+  bn.mov    w11, w7
+  jal       x1, barrett384_p384
+  bn.mov    w6, w16
+  bn.mov    w7, w17
+
+  /* Exp: 2^252-1 = ((2^126-1)<<126)+(2^126-1)
+     Val: r_252_1 <= r_126_1^(2^63)*r_126_1 mod p
+          [w17,w16] <= [w17,w16]^(2^126)*[w7,w6] mod [w13,w12] */
+  loopi     126, 4
+    bn.mov    w10, w16
+    bn.mov    w11, w17
+    jal       x1, barrett384_p384
+    nop
+  bn.mov    w10, w6
+  bn.mov    w11, w7
+  jal       x1, barrett384_p384
+
+  /* Exp: 2^255-1 = ((2^252-1)<<3)+0b111
+     Val: r_255_1 <= r_252_1^(2^3)*r111 mod p
+          [w17,w16] <= [w17,w16]^(2^3)*[w1,w0] mod [w13,w12] */
+  loopi     3, 4
+    bn.mov    w10, w16
+    bn.mov    w11, w17
+    jal       x1, barrett384_p384
+    nop
+  bn.mov    w10, w0
+  bn.mov    w11, w1
+  jal       x1, barrett384_p384
+
+  /* Exp: p-2 = ((((((2^255-1)<<33)+(2^32-1))<<94)+(2^30-1))<<2)+0b1
+     Val: x_inv <=((r_255_1^(2^33)*r_32_1)^(2^94)*r_30_1)^(2^2)*z mod p
+          [w17,w16] <= (([w17,w16]^(2^33)*[w9,w8])^(2^94)*[w3,w2])^(2^2)
+                       *[w30,w29] mod [w13,w12] */
+  loopi     33, 4
+    bn.mov    w10, w16
+    bn.mov    w11, w17
+    jal       x1, barrett384_p384
+    nop
+  bn.mov    w10, w9
+  bn.mov    w11, w8
+  jal       x1, barrett384_p384
+  loopi     94, 4
+    bn.mov    w10, w16
+    bn.mov    w11, w17
+    jal       x1, barrett384_p384
+    nop
+  bn.mov    w10, w2
+  bn.mov    w11, w3
+  jal       x1, barrett384_p384
+  loopi     2, 4
+    bn.mov    w10, w16
+    bn.mov    w11, w17
+    jal       x1, barrett384_p384
+    nop
+  bn.mov    w10, w29
+  bn.mov    w11, w30
+  jal       x1, barrett384_p384
+
+  /* store inverse [w1,w0] <= [w17,w16] = z_inv*/
+  bn.mov w0, w16
+  bn.mov w1, w17
+
+  /* convert x-coordinate to affine space
+     [w26,w25] <= [w17,w16] = x_a <= x/z = x*z_inv = [w26,w25]*[w1,w0] mod p */
+  bn.mov    w10, w25
+  bn.mov    w11, w26
+  jal       x1, barrett384_p384
+  bn.mov    w25, w16
+  bn.mov    w26, w17
+
+  /* convert y-coordinate to affine space
+     [w28,w27] <= [w17,w16] = y_a <= y/z = y*z_inv = [w28,w27]*[w1,w0] mod p */
+  bn.mov    w10, w27
+  bn.mov    w11, w28
+  bn.mov    w16, w0
+  bn.mov    w17, w1
+  jal       x1, barrett384_p384
+  bn.mov    w27, w16
+  bn.mov    w28, w17
+
+  ret
+
+
+/**
+ * Fetch curve point from dmem, randomize z-coordinate and store point in dmem
+ *
+ * returns P = (x, y, z) = (x_a*z, y_a*z, z)
+ *         with P being a valid P-384 curve point in projective coordinates
+ *              x_a and y_a being the affine coordinates as fetched from dmem
+ *              z being a randomized z-coordinate
+ *
+ * This routines fetches the affine x- and y-coordinates of a curve point from
+ * dmem and computes a valid set of projective coordinates. The z-coordinate is
+ * randomized and x and y are scaled appropriately. The resulting projective
+ * coordinates are stored at dmem[dptr_p_p] using 6 consecutive 256-bit cells,
+ * i.e. each coordinate is stored 512 bit aligned, little endian.
+ * This routine runs in constant time.
+ *
+ * @param[in]  x20: dptr_x, pointer to dmem location containing affine
+ *                          x-coordinate of input point
+ * @param[in]  x21: dptr_y, pointer to dmem location containing affine
+ *                          y-coordinate of input point
+ * @param[in]  [w15, w14]: u[383:0] lower 384 bit of Barrett constant u for
+ *                                    modulus p
+ * @param[in]  [w13, w12]: p, modulus of P-384 underlying finite field
+ * @param[in]  w31: all-zero
+ * @param[in]  x18: dptr_p_p, pointer to dmem location to store resulting point
+ *                            in projective space
+ *
+ * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
+ *        the upper limb of projective y-coordinate.
+ *
+ * clobbered registers: x10, x11 to x13
+  *                     w2, w3, w8 to w11, w16 to w24, w29, w30
+ * clobbered flag groups: FG0
+ */
+store_proj_randomize:
+
+  /* get a 384-bit random number
+    [w3, w2] = random(384) */
+  bn.wsrr   w2, 1
+  bn.wsrr   w3, 1
+  bn.rshi   w3, w31, w3 >> 128
+
+  /* reduce random number
+     [w2, w3] = z <= [w2, w3] mod p */
+  bn.sub   w10, w2, w12
+  bn.subb  w11, w3, w13
+  bn.sel   w2, w2, w10, C
+  bn.sel   w3, w3, w11, C
+
+  bn.mov w10, w2
+  bn.mov w11, w3
+
+  /* store z-coordinate
+     dmem[x20+128] = [w10, w11] */
+  li        x10, 10
+  li        x11, 11
+  bn.sid    x10, 128(x18)
+  bn.sid    x11, 160(x18)
+
+  /* fetch x-coordinate from dmem
+     [w16, w17] = x <= [dmem[dptr_x], dmem[dptr_x+32]] */
+  li x12, 16
+  li x13, 17
+  bn.lid    x12,  0(x20)
+  bn.lid    x13, 32(x20)
+
+  /* scale and store x-coordinate
+     [dmem[dptr_p_p], dmem[dptr_p_p+32]] = [w17, w16] =
+       x_p <= [w11, w10] * [w17, w16] = z*x  mod p */
+
+  jal       x1, barrett384_p384
+  bn.sid    x12,  0(x18)
+  bn.sid    x13, 32(x18)
+
+  /* fetch y-coordinate from dmem
+     [w11, w10] = x <= [dmem[dptr_y], dmem[dptr_y+32]] */
+  bn.lid    x12,  0(x21)
+  bn.lid    x13, 32(x21)
+
+  /* scale and store y-coordinate
+     [dmem[dptr_p_p+64], dmem[dptr_p_p+96]] = [w17, w16] =
+       y_p <= [w11, w10] * [w17, w16] = z*y  mod p */
+  bn.mov w10, w2
+  bn.mov w11, w3
+  jal       x1, barrett384_p384
+  bn.sid    x12, 64(x18)
+  bn.sid    x13, 96(x18)
+
+  ret
+
+
+/**
+ * P-384 scalar point multiplication in affine space
+ *
+ * returns R = k*P = k*(x_p, y_p)
+ *         where R, P are valid P-384 curve points in affine coordinates,
+ *               k is a 384-bit scalar.
+ *
+ * This routine performs scalar multiplication based on the group laws
+ * of Weierstrass curves.
+ * A constant time double-and-add algorithm (sometimes referred to as
+ * double-and-add-always) is used.
+ * Due to the P-384 optimized implementations of the internally called routines
+ * for point addition and doubling, this routine is limited to P-384 curves.
+ * The routine makes use of blinding by additive splitting the
+ * exponent/scalar d into two shares. The double-and-add loop operates on both
+ * shares in parallel applying Shamir's trick.
+ *
+ * @param[in]  x9: dptr_rnd, pointer to location in dmem containing random
+ *                           number to be used for additive splitting of scalar
+ * @param[in]  x19: dptr_k, pointer to scalar k (0 < k < n) in dmem
+ * @param[in]  x20: dptr_x, pointer to affine x-coordinate in dmem
+ * @param[in]  x21: dptr_y, pointer to affine y-coordinate in dmem
+ * @param[in]  x28: dptr_b, pointer to domain parameter b of P-384 in dmem
+ * @param[in]  x30: dptr_sp, pointer to 704 bytes of scratchpad memory in dmem
+ * @param[in]  [w15, w14]: u[383:0] lower 384 bit of Barrett constant u
+ *                                  corresponding to modulus p
+ * @param[in]  [w13, w12]: p, modulus of P-384 underlying finite field
+ * @param[in]  [w11, w10]: n, domain parameter of P-384 curve
+ *                            (order of basepoint G)
+ * @param[in]  w31: all-zero
+ * @param[out] [w26, w25]: x_a, affine x-coordinate of resulting point R.
+ * @param[out] [w28, w26]: y_a, affine y-coordinate of resulting point R.
+ *
+ * Scratchpad memory layout:
+ * The routine expects at least 704 bytes of scratchpad memory at dmem
+ * location 'scratchpad' (sp). Internally the scratchpad is used as follows:
+ * dptr_sp     .. dptr_sp+191: point P, projective
+ * dptr_sp+192 .. dptr_sp+255: s0, 1st share of scalar
+ * dptr_sp+256 .. dptr_sp+447: point 2P, projective
+ * dptr_sp+448 .. dptr_sp+511: s1, 2nd share of scalar
+ * dptr_sp+512 .. dptr_sp+703: point Q, projective
+ *
+ * Projective coordinates of a point are kept in dmem in little endian format
+ * with the individual coordinates 512 bit aligned. The coordinates are stored
+ * in x,y,z order (i.e. x at lowest, z at highest address). Thus, a 384 bit
+ * curve point occupies 6 consecutive 256-bit dmem cells.
+ *
+ * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
+ *        the computed affine y-coordinate.
+ *
+ * clobbered registers: x2, x10, x11 to x13, x18, x26, x27, w0 to w30
+ * clobbered flag groups: FG0
+ */
+scalar_mult_int_p384:
+
+  /* set regfile pointers to in/out regs of Barrett routine. Set here to avoid
+     resetting in very call to point addition routine */
+  li        x22, 10
+  li        x23, 11
+  li        x24, 16
+  li        x25, 17
+
+  /* fetch externally supplied random number from dmem
+     [w1, w0] = dmem[dptr_rnd] = [dmem[x9], dmem[x9+32]] = rnd */
+  li        x2, 0
+  bn.lid    x2++, 0(x9)
+  bn.lid    x2++, 32(x9)
+
+  /* 1st share (reduced rnd)
+     s0 = [w1, w0] <= rnd mod n = [w1, w0] mod [w11, w10] */
+  bn.sub    w9, w0, w10
+  bn.subb   w8, w1, w11
+  bn.sel    w0, w0, w9, C
+  bn.sel    w1, w1, w8, C
+
+  /* load scalar k from dmem
+     [w3, w2] = k <= dmem[dptr_k] = [dmem[x19], dmem[x19+32]] */
+  bn.lid    x2++, 0(x19)
+  bn.lid    x2, 32(x19)
+
+  /* 2nd share (k-s0)
+     s1 = [w3, w2] <= k - s0 mod n = [w2, w3] - [w1, w0] mod [w11, w10] */
+  bn.sub    w2, w2, w0
+  bn.subb   w3, w3, w1
+  bn.add    w8, w2, w10
+  bn.addc   w9, w3, w11
+  bn.sel    w2, w8, w2, C
+  bn.sel    w3, w9, w3, C
+
+  /* left align both shares for probing of MSB in loop body */
+  bn.rshi   w1, w1, w0 >> 128
+  bn.rshi   w0, w0, w31 >> 128
+  bn.rshi   w3, w3, w2 >> 128
+  bn.rshi   w2, w2, w31 >> 128
+
+   /* store shares in scratchpad */
+  li        x2, 0
+  bn.sid    x2++, 192(x30)
+  bn.sid    x2++, 224(x30)
+  bn.sid    x2++, 448(x30)
+  bn.sid    x2++, 480(x30)
+
+  /* get randomized projective coodinates of curve point
+     P = (x_p, y_p, z_p) = dmem[dptr_sp] = (x*z mod p, y*z mod p, z) */
+  add       x18, x30, 0
+  jal       x1, store_proj_randomize
+
+  /* double point P
+     2P = ([w30,w29], [w28,w27], [w26, w25]) <= 2*P */
+  add       x27, x30, x0
+  add       x26, x30, x0
+  jal       x1, proj_add_p384
+
+  /* store point 2P in scratchpad @w30+256
+     dmem[dptr_sc+256] = [w30:w25] = 2P */
+  li        x2, 25
+  bn.sid    x2++, 256(x30)
+  bn.sid    x2++, 288(x30)
+  bn.sid    x2++, 320(x30)
+  bn.sid    x2++, 352(x30)
+  bn.sid    x2++, 384(x30)
+  bn.sid    x2++, 416(x30)
+
+  /* init point Q = (0,1,0) for double-and-add in scratchpad */
+  /* dmem[x26] = dmem[dptr_sc+512] = Q = (0,1,0) */
+  addi      x26, x30, 512
+  li        x2, 30
+  bn.addi   w30, w31, 1
+  bn.sid    x2++, 64(x26)
+  bn.sid    x2, 0(x26)
+  bn.sid    x2, 32(x26)
+  bn.sid    x2, 96(x26)
+  bn.sid    x2, 128(x26)
+  bn.sid    x2, 160(x26)
+
+  /* double-and-add loop with decreasing index */
+  loopi     384, 85
+
+    /* double point Q
+       Q = ([w30,w29], [w28,w27], [w26, w25]) <= Q + dmem[x27] */
+    add       x27, x26, x0
+    jal       x1, proj_add_p384
+
+    /* store Q in dmem
+     dmem[x26] = dmem[dptr_sc+512] <= [w30:w25] */
+    li        x2, 25
+    bn.sid    x2++, 0(x26)
+    bn.sid    x2++, 32(x26)
+    bn.sid    x2++, 64(x26)
+    bn.sid    x2++, 96(x26)
+    bn.sid    x2++, 128(x26)
+    bn.sid    x2++, 160(x26)
+
+    /* Probe if MSb of either of the two scalars (rnd or d-rnd) but not both
+       is 1.
+       If only one MSb is set, select P for addition.
+       If both MSbs are set, select 2P for addition.
+       (If neither MSB is set, 2P will be selected but result discarded.) */
+    li        x2, 0
+    bn.lid    x2++, 224(x30)
+    bn.lid    x2, 480(x30)
+    bn.xor    w8, w0, w1
+    /* Create conditional offeset into scratchpad.
+       if (s0[512] xor s1[512]) x27 <= x30 else x27 <= x30+256 */
+    csrrs     x3, 0x7c0, x0
+    xori      x3, x3, -1
+    andi      x3, x3, 2
+    slli      x27, x3, 7
+    add       x27, x27, x30
+
+    /* Reload randomized projective coodinates for curve point P.
+       P = (x_p, y_p, z_p) = dmem[dptr_sp] <= (x*z mod p, y*z mod p, z) */
+    jal       x1, store_proj_randomize
+
+    /* Add points Q+P or Q+2P depending on offset in x27.
+       Q_a = ([w30,w29], [w28,w27], [w26, w25]) <= Q + dmem[x27] */
+    jal       x1, proj_add_p384
+
+    /* load shares from scratchpad
+       [w1, w0] = s0; [w3, w2] = s1 */
+    li        x2, 0
+    bn.lid    x2++, 192(x30)
+    bn.lid    x2++, 224(x30)
+    bn.lid    x2++, 448(x30)
+    bn.lid    x2++, 480(x30)
+
+    /* M = s0[511] | s1[511] */
+    bn.or     w8, w1, w3
+
+    /* load q from scratchpad
+        Q = ([w9,w8], [w7,w6], [w5,w4]) <= dmem[x26] */
+    li        x2, 4
+    bn.lid    x2++, 0(x26)
+    bn.lid    x2++, 32(x26)
+    bn.lid    x2++, 64(x26)
+    bn.lid    x2++, 96(x26)
+    bn.lid    x2++, 128(x26)
+    bn.lid    x2++, 160(x26)
+
+    /* select either Q or Q_a
+       if M: Q = ([w30,w29], [w28,w27], [w26, w25]) <= Q else: Q <= Q_a */
+    bn.sel    w25, w25, w4, M
+    bn.sel    w26, w26, w5, M
+    bn.sel    w27, w27, w6, M
+    bn.sel    w28, w28, w7, M
+    bn.sel    w29, w29, w8, M
+    bn.sel    w30, w30, w9, M
+
+    /* store Q in dmem
+     dmem[x26] = dmem[dptr_sc+512] <= [w30:w25] */
+    li        x2, 25
+    bn.sid    x2++, 0(x26)
+    bn.sid    x2++, 32(x26)
+    bn.sid    x2++, 64(x26)
+    bn.sid    x2++, 96(x26)
+    bn.sid    x2++, 128(x26)
+    bn.sid    x2++, 160(x26)
+
+    /* left shift both shares
+       s0 <= s0 << 1 ; s1 <= s1 << 1 */
+    bn.add    w0, w0, w0
+    bn.addc   w1, w1, w1
+    bn.add    w2, w2, w2
+    bn.addc   w3, w3, w3
+    /* store both shares in scratchpad */
+    li        x2, 0
+    bn.sid    x2++, 192(x30)
+    bn.sid    x2++, 224(x30)
+    bn.sid    x2++, 448(x30)
+    bn.sid    x2++, 480(x30)
+
+
+    /* Get a fresh random number and scale the coordinates of 2P.
+       (scaling each proj. coordinate by same factor results in same point) */
+
+    /* get a 384-bit random number */
+    bn.wsrr   w2, 1
+    bn.wsrr   w3, 1
+    bn.rshi   w3, w31, w3 >> 128
+
+    /* reduce random number
+      [w2, w3] = z <= [w2, w3] mod p */
+    bn.sub    w10, w2, w12
+    bn.subb   w11, w3, w13
+    bn.sel    w2, w2, w10, C
+    bn.sel    w3, w3, w11, C
+
+    /* scale all coordinates in scratchpad */
+    li        x2, 16
+    li        x3, 17
+    /* x-coordinate */
+    bn.mov    w10, w2
+    bn.mov    w11, w3
+    bn.lid    x2, 256(x30)
+    bn.lid    x3, 288(x30)
+    jal       x1, barrett384_p384
+    bn.sid    x2, 256(x30)
+    bn.sid    x3, 288(x30)
+    /* y-coordinate */
+    bn.mov    w10, w2
+    bn.mov    w11, w3
+    bn.lid    x2, 320(x30)
+    bn.lid    x3, 352(x30)
+    jal       x1, barrett384_p384
+    bn.sid    x2, 320(x30)
+    bn.sid    x3, 352(x30)
+    /* z-coordinate */
+    bn.mov    w10, w2
+    bn.mov    w11, w3
+    bn.lid    x2, 384(x30)
+    bn.lid    x3, 416(x30)
+    jal       x1, barrett384_p384
+    bn.sid    x2, 384(x30)
+    bn.sid    x3, 416(x30)
+
+  /* convert coordinates to affine space */
+  jal       x1, proj_to_affine_p384
+
+  ret
+
+
+/**
+ * Externally callable wrapper for P-384 scalar point multiplication
+ *
+ * returns R = k*P = k*(x_p, y_p)
+ *         where R, P are valid P-384 curve points in affine coordinates,
+ *               k is a 384-bit scalar..
+ *
+ * Sets up context and calls internal scalar multiplication routine.
+ * This routine runs in constant time.
+ *
+ * @param[in]  dmem[0]: dK, pointer to location in dmem containing scalar k
+ * @param[in]  dmem[4]: dRnd, pointer to location in dmem containing random
+ *                        number for blinding
+ * @param[in]  dmem[20]: dptr_x, pointer to affine x-coordinate in dmem
+ * @param[in]  dmem[22]: dptr_y, pointer to affine y-coordinate in dmem
+ *
+ * 384-bit quantities have to be provided in dmem in little-endian format,
+ * 512 bit aligned, with the highest 128 bit set to zero.
+ *
+ * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
+ *        the computed affine y-coordinate.
+ *
+ * clobbered registers: x2, x3, x9 to x13, x18 to x21, x26 to x30
+ *                      w0 to w30
+ * clobbered flag groups: FG0
+ */
+.globl scalar_mult_p384
+scalar_mult_p384:
+
+  /* set dmem pointer to point x-coordinate */
+  la        x20, dptr_x
+  lw        x20, 0(x20)
+
+  /* set dmem pointer to point y-coordinate */
+  la        x21, dptr_y
+  lw        x21, 0(x21)
+
+  /* set dmem pointer to scalar k */
+  la        x19, dptr_k
+  lw        x19, 0(x19)
+
+  /* set pointer to blinding parameter */
+  la        x9, dptr_rnd
+  lw        x9, 0(x9)
+
+  /* set dmem pointer to domain parameter b */
+  la        x28, p384_b
+
+  /* set dmem pointer to scratchpad */
+  la        x30, scratchpad
+
+  /* load domain parameter p (modulus)
+     [w13, w12] = p = dmem[p384_p] */
+  li        x2, 12
+  la        x3, p384_p
+  bn.lid    x2++, 0(x3)
+  bn.lid    x2++, 32(x3)
+
+  /* load Barrett constant u for modulus p
+     [w15, w14] = u_p = dmem[p384_u_p] */
+  li        x2, 14
+  la        x3, p384_u_p
+  bn.lid    x2++, 0(x3)
+  bn.lid    x2++, 32(x3)
+
+  /* load domain parameter n (order of base point)
+     [w11, w10] = n = dmem[p384_n] */
+  li        x2, 10
+  la        x3, p384_n
+  bn.lid    x2++, 0(x3)
+  bn.lid    x2++, 32(x3)
+
+  /* init all-zero reg */
+  bn.xor    w31, w31, w31
+
+  jal       x1, scalar_mult_int_p384
+
+  ret
+
+
+/**
+ * Variable-time modular multiplicative inverse computation
+ *
+ * returns x_inv = x^-1 mod m
+ *
+ * This routine computes the modular multiplicative inverse for any x < m in
+ * the finite field GF(m) where m is prime.
+ *
+ * For inverse computation, Fermat's little theorem is used, i.e.
+ * we compute x^-1 = x^(m-2) mod m.
+ * For exponentiation we use a standard, variable-time (!) square and multiply
+ * algorithm.
+ *
+ * This routine is mainly intended to be used for inversion of scalars in
+ * context of the P-384 curve. In theory, it can be used with any 384-bit
+ * modulus m with a corresponding 385-bit Barrett constant u,
+ * where u[383:192] = 0.
+ *
+ * Note: When used for P-384 scalar inversion, the routine will need 672 calls
+ * to the multiplication routine. By using an adder chain this could be reduced
+ * to ~433 multiplications, however, at the cost of a significant codes size
+ * increase.
+ *
+ * Note: This routine runs in variable-time w.r.t. the modulus. It should only
+ * be used with a non-secret modulus.
+ *
+ * @param[in]  [w13, w12]: m, 384 bit modulus
+ * @param[in]  [w15, w14]: u[383:0], lower 384 bit of pre-computed Barrett
+ *                          constant corresponding to modulus m
+ * @param[in]  [w30, w29]: x, 384 bit operand
+ * @param[in]  w31, all-zero
+ * @param[out] [w17, w16]: x_inv, modular multiplicative inverse
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * clobbered registers: x2, w2, w3, w10, w11, w16 to w24
+ * clobbered flag groups: FG0
+ */
+mod_inv_n_p384:
+
+  /* subtract 2 from modulus for Fermat's little theorem
+     [w13,w12] <= m - 2 = [w11,w10]-2 (left aligned) */
+  bn.subi   w2, w12, 2
+  bn.subb   w3, w13, w31
+  bn.rshi   w3, w3, w2 >> 128
+  bn.rshi   w2, w2, w31 >> 128
+
+  /* init square and multiply: [w17,w16] = 1 */
+  bn.addi   w16, w31, 1
+  bn.mov    w17, w31
+
+  /* square and multiply loop */
+  loopi     384, 12
+
+    /* square: [w17,w16] <= [w17, w16]*[w11,w10] mod [w13, w12] */
+    bn.mov    w10, w16
+    bn.mov    w11, w17
+    jal       x1, barrett384_p384
+
+    /* shift MSB into carry flag
+       [w3,w2] = 2*[w3,w2] = [w3,w2] << 1 */
+    bn.add    w2, w2, w2
+    bn.addc   w3, w3, w3
+
+    /* skip multiplication if C flag not set */
+    csrrs     x2, 1984, x0
+    andi      x2, x2, 1
+    beq       x2, x0, nomul
+
+    /* multiply: [w17,w16] <= [w17, w16]*[w30,w29] mod [w13, w12] */
+    bn.mov    w10, w29
+    bn.mov    w11, w30
+    jal       x1, barrett384_p384
+
+    nomul:
+    nop
+
+  ret
+
+
+/**
+ * P-384 ECDSA signature generation
+ *
+ * returns the signature as the pair r, s with
+ *         r = x_1  mod n
+ *     and s = k^(-1)(msg + r*d)  mod n
+ *         where x_1 is the affine x-coordinate of the curve point k*G,
+ *               G is the curve's base point,
+ *               k is a supplied secret random number,
+ *               n is the order of the base point G of P-256,
+ *               msg is the message to be signed, and
+ *               d is the private key.
+ *
+ * This routine runs in constant time.
+ *
+ * @param[in]  dmem[0]: dptr_k, pointer to a 384 bit random secret in dmem
+ * @param[in]  dmem[4]: dptr_rnd, pointer to location in dmem containing
+ *                       a 384-bit random number for blinding
+ * @param[in]  dmem[8]: dptr_msg, pointer to the message to be signed in dmem
+ * @param[in]  dmem[12]: dptr_r, pointer to dmem location where s component
+ *                               of signature will be placed
+ * @param[in]  dmem[16]: dptr_s, pointer to dmem location where r component
+ *                               of signature will be placed
+ * @param[in]  dmem[28]: dptr_d, pointer to private key d in dmem
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * clobbered registers: x2, x3, x9 to x13, x18 to x28, x30
+ *                      w0 to w31
+ * clobbered flag groups: FG0
+ */
+.globl p384_sign
+p384_sign:
+  /* init all-zero reg */
+  bn.xor    w31, w31, w31
+
+  /* set dmem pointer to domain parameter b */
+  la        x28, p384_b
+
+  /* set dmem pointer to basepoint x-coordinate */
+  la        x20, p384_gx
+
+  /* set dmem pointer to basepoint y-coordinate */
+  la        x21, p384_gy
+
+  /* set dmem pointer to secret random scalar k */
+  la        x19, dptr_k
+  lw        x19, 0(x19)
+
+  /* set pointer to blinding parameter */
+  la        x9, dptr_rnd
+  lw        x9, 0(x9)
+
+  /* set dmem pointer to scratchpad */
+  la        x30, scratchpad
+
+  /* load domain parameter p (modulus)
+     [w13, w12] <= p = dmem[dptr_p] */
+  li        x2, 12
+  la        x3, p384_p
+  bn.lid    x2++, 0(x3)
+  bn.lid    x2++, 32(x3)
+
+  /* load Barrett constant u for modulus p
+     [w15, w14] = u_p = dmem[p384_u_p] */
+  li        x2, 14
+  la        x3, p384_u_p
+  bn.lid    x2++, 0(x3)
+  bn.lid    x2++, 32(x3)
+
+  /* load domain parameter n (order of base point)
+     [w11, w10] = n = dmem[p384_n] */
+  li        x2, 10
+  la        x3, p384_n
+  bn.lid    x2++, 0(x3)
+  bn.lid    x2++, 32(x3)
+
+  /* scalar multiplication with base point
+     [w28:w25] <= (x_1, y_1) = k*G */
+  jal       x1, scalar_mult_int_p384
+
+  /* store r of signature in dmem: dmem[dptr_r] <= r = [w26,w25] */
+  li        x2, 25
+  la        x3, dptr_r
+  lw        x3, 0(x3)
+  bn.sid    x2++, 0(x3)
+  bn.sid    x2++, 32(x3)
+
+  /* load secret random number k from dmem
+     [w30,w29] <= k = dmem[dptr_k] */
+  li        x2, 29
+  bn.lid    x2++, 0(x19)
+  bn.lid    x2++, 32(x19)
+
+  /* load domain parameter n (order of base point)
+     [w13, w12] <= p = dmem[p384_n] */
+  li        x2, 12
+  la        x3, p384_n
+  bn.lid    x2++, 0(x3)
+  bn.lid    x2++, 32(x3)
+
+  /* load Barrett constant u_n for modulus n for scalar operations
+     [w15, w14] <= u_m = dmem[p384_u_n] */
+  li        x2, 14
+  la        x3, p384_u_n
+  bn.lid    x2++, 0(x3)
+  bn.lid    x2++, 32(x3)
+
+  /* modular multiplicative inverse of k
+     [w3, w2] <= [w17, w16] <= k^(-1) mod n */
+  jal       x1, mod_inv_n_p384
+  bn.mov    w2, w16
+  bn.mov    w3, w17
+
+  /* load private key d from dmem
+     [w11,w10] <= d = dmem[dptr_d] */
+  li        x2, 10
+  la        x3, dptr_d
+  lw        x3, 0(x3)
+  bn.lid    x2++, 0(x3)
+  bn.lid    x2++, 32(x3)
+
+  /* [w17, w16] <= k^(-1)*d mod n = [w17, w16] * [w11, w10] mod [w13, w12] */
+  jal       x1, barrett384_p384
+
+  /*  [w5, w4] <= [w17, w16]
+        <= r * (k^(-1)*d) mod n = [w26, w25] * [w17, w16] mod [w13, w12] */
+  bn.mov    w10, w25
+  bn.mov    w11, w26
+  jal       x1, barrett384_p384
+  bn.mov    w4, w16
+  bn.mov    w5, w17
+
+  /* load message from dmem
+     [w11, w10] <= msg = dmem[dptr_msg] */
+  li        x2, 10
+  la        x3, dptr_msg
+  lw        x3, 0(x3)
+  bn.lid    x2++, 0(x3)
+  bn.lid    x2++, 32(x3)
+
+  /* [w17, w16] <= k^(-1) * msg = [w3, w2]*[w17, w16] mod n */
+  bn.mov    w16, w2
+  bn.mov    w17, w3
+  jal       x1, barrett384_p384
+
+  /* [w28, w27] <= s' = k^(-1)*msg + k^(-1)*r*d  = [w17, w16] + [w5, w4]*/
+  bn.add    w27, w16, w4
+  bn.addc   w28, w17, w5
+
+  /* reduce s: [w28, w27] <= s <= s' mod n = [w28, w27] mod [w13, w12] */
+  bn.sub    w10, w27, w12
+  bn.subb   w11, w28, w13
+  bn.sel    w27, w27, w10, C
+  bn.sel    w28, w28, w11, C
+
+  /* store s of signature in dmem: dmem[dptr_s] <= s = [w28, w27] */
+  li        x2, 27
+  la        x3, dptr_s
+  lw        x3, 0(x3)
+  bn.sid    x2++, 0(x3)
+  bn.sid    x2++, 32(x3)
+
+  ret
+
+
+/* pointers and scratchpad memory */
+.section .data
+
+/* pointer to k (dptr_k) */
+.globl dptr_k
+dptr_k:
+  .zero 4
+
+/* pointer to rnd (dptr_rnd) */
+.globl dptr_rnd
+dptr_rnd:
+  .zero 4
+
+/* pointer to msg (dptr_msg) */
+.globl dptr_msg
+dptr_msg:
+  .zero 4
+
+/* pointer to R (dptr_r) */
+.globl dptr_r
+dptr_r:
+  .zero 4
+
+/* pointer to S (dptr_s) */
+.globl dptr_s
+dptr_s:
+  .zero 4
+
+/* pointer to X (dptr_x) */
+.globl dptr_x
+dptr_x:
+  .zero 4
+
+/* pointer to Y (dptr_y) */
+.globl dptr_y
+dptr_y:
+  .zero 4
+
+/* pointer to D (dptr_d) */
+.globl dptr_d
+dptr_d:
+  .zero 4
+
+/* 704 bytes of scratchpad memory */
+scratchpad:
+  .zero 704
diff --git a/sw/otbn/code-snippets/rules.mk b/sw/otbn/code-snippets/rules.mk
index 280584c..646461f 100644
--- a/sw/otbn/code-snippets/rules.mk
+++ b/sw/otbn/code-snippets/rules.mk
@@ -151,18 +151,26 @@
 
 # p384_proj_add_test depends on p384_proj_add defined in p384.s
 $(otbn-code-snippets-bin-dir)/p384_proj_add_test.elf: \
-  $(otbn-code-snippets-obj-dir)/p384.o
+  $(otbn-code-snippets-obj-dir)/p384_base.o
 $(otbn-code-snippets-bin-dir)/p384_proj_add_test.elf: \
-  otbn-libs += $(otbn-code-snippets-obj-dir)/p384.o
+  otbn-libs += $(otbn-code-snippets-obj-dir)/p384_base.o
 
-# p384_scalar_mult_test depends on p384_scalar_mult_int defined in p384.s
-$(otbn-code-snippets-bin-dir)/p384_scalar_mult_test.elf: \
-  $(otbn-code-snippets-obj-dir)/p384.o
-$(otbn-code-snippets-bin-dir)/p384_scalar_mult_test.elf: \
-  otbn-libs += $(otbn-code-snippets-obj-dir)/p384.o
+# code in p384_sign depends on code defined in p384_base.s
+$(otbn-code-snippets-bin-dir)/p384_sign.elf: \
+  $(otbn-code-snippets-obj-dir)/p384_base.o
+$(otbn-code-snippets-bin-dir)/p384_sign.elf: \
+  otbn-libs += $(otbn-code-snippets-obj-dir)/p384_base.o
 
-# p384_ecdsa_sign_test depends on p384_sign defined in p384.s
+# p384_scalar_mult_test depends on scalar_mult_p384 defined in p384_sign.s
+$(otbn-code-snippets-bin-dir)/p384_scalar_mult_test.elf: \
+  $(otbn-code-snippets-obj-dir)/p384_sign.o
+$(otbn-code-snippets-bin-dir)/p384_scalar_mult_test.elf: \
+  otbn-libs += $(otbn-code-snippets-obj-dir)/p384_sign.o \
+  $(otbn-code-snippets-obj-dir)/p384_base.o
+
+# p384_ecdsa_sign_test depends on p384_sign defined in p384_sign.s
 $(otbn-code-snippets-bin-dir)/p384_ecdsa_sign_test.elf: \
-  $(otbn-code-snippets-obj-dir)/p384.o
+  $(otbn-code-snippets-obj-dir)/p384_sign.o
 $(otbn-code-snippets-bin-dir)/p384_ecdsa_sign_test.elf: \
-  otbn-libs += $(otbn-code-snippets-obj-dir)/p384.o
+  otbn-libs += $(otbn-code-snippets-obj-dir)/p384_sign.o \
+  $(otbn-code-snippets-obj-dir)/p384_base.o