[otbn] split P-384 lib
This splits the P-384 lib into two assembly files:
- a base lib containing the domain parameters and
routines for point addition
- a sign lib containing routines for signature generation
and constant-time scalar point multiplication building up
on the base lib.
This is in preparation for having independent P-384 binaries,
one for signature generation and one for signature verification.
Signed-off-by: Felix Miller <felix.miller@gi-de.com>
diff --git a/sw/otbn/code-snippets/p384.s b/sw/otbn/code-snippets/p384.s
deleted file mode 100644
index 06a66c4..0000000
--- a/sw/otbn/code-snippets/p384.s
+++ /dev/null
@@ -1,1827 +0,0 @@
-/* Copyright lowRISC contributors. */
-/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
-/* SPDX-License-Identifier: Apache-2.0 */
-/*
- * P-384 specific routines
- */
-
- .section .text
-
-
-/**
- * 384-bit modular multiplication based on Barrett reduction algorithm
- * optimized for the special modulus of the NIST P-384 curve.
- *
- * Returns c = a x b % p.
- *
- * Expects: two operands, modulus p and pre-calculated parameter u for barrett
- * reduction (usually greek mu in literature). u is expected without the
- * leading 1 at bit 384. u has to be pre-calculated as u = floor(2^768/p).
- *
- * This implementation mostly follows the description in the
- * "Handbook of Applied Cryptography" in Algorithm 14.42.
- * Differences:
- * - This implementation incorporates a multiplication before the reduction.
- * Therefore it expects two operands (a, b) instead of a wider integer x.
- * - The computation of q2 ignores the MSbs of q1 and u to allow using
- * a 384x384 bit multiplication. This is compensated later by
- * individual (conditional) additions.
- * - The truncations in step 2 of HAC 14.42 in the form of (... mod b^(k+1) )
- * are not implemented here and the full register width is used. This
- * allows to omit computation of r1 (since r1=x) and step 3 of HAC 14.42
- *
- * Flags: Flags have no meaning beyond the scope of this subroutine.
- *
- * @param[in] [w11, w10]: a, first operand, max. length 384 bit, a < p.
- * @param[in] [w17, w16]: b, second operand, max. length 384 bit, b < p.
- * @param[in] [w13, w12]: p, modulus of P384 i.e.:
- p = 2^384 - 2^128 - 2^96 + 2^32 - 1.
- * @param[in] [w15, w14]: u, pre-computed Barrett constant (without u[384]/MSb
- * of u which is always 1 for the allowed range but
- * has to be set to 0 here).
- * @param[in] w31: all-zero.
- * @param[out] [w17, w16]: c, result, max. length 384 bit.
- *
- * Clobbered registers: w10, w11, w16, w17, w18, w19, w20, w21, w22, w23, w24
- * Clobbered flag groups: FG0
- */
- .globl barrett384_p384
-barrett384_p384:
- /* Compute the integer product of the operands x = a * b
- x = [w18, w22, w21] = a * b = [w11, w10] * [w17, w16]
- => max. length x: 768 bit */
- bn.mulqacc.z w10.0, w16.0, 0
- bn.mulqacc w10.0, w16.1, 64
- bn.mulqacc.so w21.L, w10.1, w16.0, 64
- bn.mulqacc w10.0, w16.2, 0
- bn.mulqacc w10.1, w16.1, 0
- bn.mulqacc w10.2, w16.0, 0
- bn.mulqacc w10.0, w16.3, 64
- bn.mulqacc w10.1, w16.2, 64
- bn.mulqacc w10.2, w16.1, 64
- bn.mulqacc.so w21.U, w10.3, w16.0, 64
- bn.mulqacc w10.0, w17.0, 0
- bn.mulqacc w10.1, w16.3, 0
- bn.mulqacc w10.2, w16.2, 0
- bn.mulqacc w10.3, w16.1, 0
- bn.mulqacc w11.0, w16.0, 0
- bn.mulqacc w10.0, w17.1, 64
- bn.mulqacc w10.1, w17.0, 64
- bn.mulqacc w10.2, w16.3, 64
- bn.mulqacc w10.3, w16.2, 64
- bn.mulqacc w11.0, w16.1, 64
- bn.mulqacc.so w22.L, w11.1, w16.0, 64
- bn.mulqacc w10.1, w17.1, 0
- bn.mulqacc w10.2, w17.0, 0
- bn.mulqacc w10.3, w16.3, 0
- bn.mulqacc w11.0, w16.2, 0
- bn.mulqacc w11.1, w16.1, 0
- bn.mulqacc w10.2, w17.1, 64
- bn.mulqacc w10.3, w17.0, 64
- bn.mulqacc w11.0, w16.3, 64
- bn.mulqacc.so w22.U, w11.1, w16.2, 64
- bn.mulqacc w10.3, w17.1, 0
- bn.mulqacc w11.0, w17.0, 0
- bn.mulqacc w11.1, w16.3, 0
- bn.mulqacc w11.0, w17.1, 64
- bn.mulqacc.so w18.L, w11.1, w17.0, 64
- bn.mulqacc.so w18.U, w11.1, w17.1, 0
-
- /* Store correction factor to compensate for later neglected MSb of x.
- x is 768 bit wide and therefore the 383 bit right shifted version q1
- (below) contains 385 bit. Bit 384 of q1 is neglected to allow using a
- 384x384 multiplier. For the MSb of x being set we temporarily store u
- (or zero) here to be used in a later constant time correction of a
- multiplication with u. Note that this requires the MSb flag being carried
- over from the multiplication routine, i.e. no flag-modifying instruction
- may be placed between the last bn.mulqacc.so above and these selects. */
- bn.sel w23, w14, w31, M
- bn.sel w24, w15, w31, M
-
- /* Compute q1 = x >> 383
- q1 = [w11, w10] = [w18, w22, w21] >> 383 = [w18, w22] >> 127
- => max length q1: 385 bits */
- bn.rshi w11, w31, w18 >> 127
- bn.rshi w10, w18, w22 >> 127
-
- /* Compute q2 = q1*u
- Instead of full q2 (which would be up to 770 bits) we ignore the MSb of u
- and the MSb of q1 and correct this later. This allows using a 384x384
- multiplier. We use the property that u for the modulus of P384 is zero in
- the bits 383 downto 129 and use a 384x192 multiplication routine.
- => max. length q2': 513 bit
- q2' = q1[383:0]*u[128:0] = [w18, w17, w16] = [w11, w10] * [w15, w14] */
-
- /* 576 = 384*192 bit multiplication kernel */
- bn.mulqacc.z w10.0, w14.0, 0
- bn.mulqacc w10.0, w14.1, 64
- bn.mulqacc.so w16.L, w10.1, w14.0, 64
- bn.mulqacc w10.0, w14.2, 0
- bn.mulqacc w10.1, w14.1, 0
- bn.mulqacc w10.2, w14.0, 0
- bn.mulqacc w10.1, w14.2, 64
- bn.mulqacc w10.2, w14.1, 64
- bn.mulqacc.so w16.U, w10.3, w14.0, 64
- bn.mulqacc w10.2, w14.2, 0
- bn.mulqacc w10.3, w14.1, 0
- bn.mulqacc w11.0, w14.0, 0
- bn.mulqacc w10.3, w14.2, 64
- bn.mulqacc w11.0, w14.1, 64
- bn.mulqacc.so w17.L, w11.1, w14.0, 64
- bn.mulqacc w11.0, w14.2, 0
- bn.mulqacc w11.1, w14.1, 0
- bn.mulqacc.so w17.U, w11.1, w14.2, 64
-
- /* w14.3 is always zero here due to structure of Barrett constant, so this
- multiplication adds nothing; it only writes the remaining accumulator
- content to w18. */
- bn.mulqacc.wo w18, w11.1, w14.3, 64
-
- /* q3 = q2 >> 385
- In this step, the compensation for the neglected MSbs of q1 and u is
- carried out underway. To add them in the q2 domain, they would have to be
- left shifted by 384 bit first. To directly add them we first shift q2' by
- 384 bit to the right, perform the additions, and shift the result another
- bit to the right. The additions cannot overflow due to leading zeros
- after shift.
- q2'' = q2' >> 384 = [w20, w19] = [w18, w17, w16] >> 384
- = [w18, w17] >> 128 */
- bn.rshi w20, w31, w18 >> 128
- bn.rshi w19, w18, w17 >> 128
- /* Add q1. This is unconditional since MSb of u is always 1.
- This cannot overflow due to leading zeros.
- q2''' = q2'' + q1 = [w20, w19] = [w20, w19] + [w11, w10] */
- bn.add w19, w19, w10
- bn.addc w20, w20, w11
- /* Conditionally add u (without leading 1) in case of MSb of x being set.
- This is the "real" q2 but shifted by 384 bits to the right. This cannot
- overflow due to leading zeros
- q2'''' = x[767]?q2'''+u[383:0]:q2'''
- = [w20, w19] + [w24, w23] = q2 >> 384 */
- bn.add w19, w19, w23
- bn.addc w20, w20, w24
- /* finally this gives q3 by shifting the remaining bit to the right
- q3 = q2 >> 385 = q2'''' >> 1 = [w11, w10] = [w20, w19] >> 1 */
- bn.rshi w11, w31, w20 >> 1
- bn.rshi w10, w20, w19 >> 1
-
- /* r2 = (q3*p)[511:0] = [w17, w16] = ([w11, w10] * [w13, w12])[511:0]
- A 384x384 bit multiplication kernel is used here, hence both q3 or p
- must not be wider than 384 bit. This is always the case for p. For q3 it
- is the case if a<p and b<p.
- The 256 highest bits of the multiplication result are not needed,
- so we do not compute them. */
- bn.mulqacc.z w10.0, w12.0, 0
- bn.mulqacc w10.0, w12.1, 64
- bn.mulqacc.so w16.L, w10.1, w12.0, 64
- bn.mulqacc w10.0, w12.2, 0
- bn.mulqacc w10.1, w12.1, 0
- bn.mulqacc w10.2, w12.0, 0
- bn.mulqacc w10.0, w12.3, 64
- bn.mulqacc w10.1, w12.2, 64
- bn.mulqacc w10.2, w12.1, 64
- bn.mulqacc.so w16.U, w10.3, w12.0, 64
- bn.mulqacc w10.0, w13.0, 0
- bn.mulqacc w10.1, w12.3, 0
- bn.mulqacc w10.2, w12.2, 0
- bn.mulqacc w10.3, w12.1, 0
- bn.mulqacc w11.0, w12.0, 0
- bn.mulqacc w10.0, w13.1, 64
- bn.mulqacc w10.1, w13.0, 64
- bn.mulqacc w10.2, w12.3, 64
- bn.mulqacc w10.3, w12.2, 64
- bn.mulqacc w11.0, w12.1, 64
- bn.mulqacc.so w17.L, w11.1, w12.0, 64
- bn.mulqacc w10.1, w13.1, 0
- bn.mulqacc w10.2, w13.0, 0
- bn.mulqacc w10.3, w12.3, 0
- bn.mulqacc w11.0, w12.2, 0
- bn.mulqacc w11.1, w12.1, 0
- bn.mulqacc w10.2, w13.1, 64
- bn.mulqacc w10.3, w13.0, 64
- bn.mulqacc w11.0, w12.3, 64
- bn.mulqacc.so w17.U, w11.1, w12.2, 64
-
- /* Compute r = x-r2 = x-q3*p
- since 0 <= r < 3*p, we only need to consider the lower limbs of x and r2
- r[511:0] = [w22, w21] - [w17, w16] */
- bn.sub w21, w21, w16
- bn.subb w22, w22, w17
-
- /* Barrett algorithm requires subtraction of the modulus at most two times if
- result is too large. However in the special case of P-384 we need to
- subtract only once.
- The trial difference r-p is computed into [w17, w16]; if that subtraction
- borrowed (C set, i.e. r < p) the unreduced r in [w22, w21] is selected,
- otherwise the difference is kept. */
- bn.sub w16, w21, w12
- bn.subb w17, w22, w13
- bn.sel w16, w21, w16, C
- bn.sel w17, w22, w17, C
-
- /* return result: c =[w17, w16] = a * b % p. */
- ret
-
-
-/**
- * P-384 point addition in projective space
- *
- * returns R = (x_r, y_r, z_r) <= P+Q = (x_p, y_p, z_p) + (x_q, y_q, z_q)
- * with R, P and Q being valid P-384 curve points
- * in projective coordinates
- *
- * This routine adds two valid P-384 curve points in projective space.
- * Point addition is performed based on the complete formulas of Bosma and
- * Lenstra for Weierstrass curves as first published in [1] and
- * optimized in [2].
- * The implemented version follows Algorithm 4 of [2] which is an optimized
- * variant for Weierstrass curves with domain parameter 'a' set to a=-3.
- * Numbering of the steps below and naming of symbols follows the
- * terminology of Algorithm 4 of [2].
- * The routine is limited to P-384 curve points due to:
- * - fixed a=-3 domain parameter
- * - usage of a P-384 optimized Barrett multiplication kernel
- * This routine runs in constant time.
- *
- * Each coordinate is read from dmem as two 256-bit words (low limb at the
- * given offset, high limb 32 bytes above), with x, y and z located at
- * offsets 0, 64 and 128 from the respective point pointer.
- *
- * [1] https://doi.org/10.1006/jnth.1995.1088
- * [2] https://doi.org/10.1007/978-3-662-49890-3_16
- *
- * @param[in] x22: set to 10, pointer to in reg for Barrett routine
- * @param[in] x23: set to 11, pointer to in reg for Barrett routine
- * @param[in] x24: set to 16, pointer to in/out reg for Barrett routine
- * @param[in] x25: set to 17, pointer to in/out reg for Barrett routine
- * @param[in] x26: dptr_p_p, dmem pointer to point P in dmem (projective)
- * @param[in] x27: dptr_q_p, dmem pointer to point Q in dmem (projective)
- * @param[in] x28: dptr_b, dmem pointer to domain parameter b of P-384 in dmem
- * @param[in] [w13, w12]: p, modulus of underlying field of P-384
- * @param[in] [w15, w14]: u[383:0] lower 384 bit of Barrett constant u for
- * modulus p
- * @param[in] w31: all-zero.
- * @param[out] [w26, w25]: x_r, x-coordinate of resulting point R
- * @param[out] [w28, w27]: y_r, y-coordinate of resulting point R
- * @param[out] [w30, w29]: z_r, z-coordinate of resulting point R
- *
- * Flags: Flags have no meaning beyond the scope of this subroutine.
- *
- * clobbered registers: w0 to w30
- * clobbered flag groups: FG0
- */
-.globl proj_add_p384
-proj_add_p384:
- /* mapping of parameters to symbols of [2] (Algorithm 4):
- X1 = x_p; Y1 = y_p; Z1 = z_p; X2 = x_q; Y2 = y_q; Z2 = z_q
- X3 = x_r; Y3 = y_r; Z3 = z_r */
-
- /* 1: [w1, w0] = t0 <= X1*X2 = dmem[x26+0]*dmem[x27+0]
- bn.lid addresses its destination register indirectly: x22..x25 hold the
- register indices 10, 11, 16 and 17 (see parameter list), so the loads
- below fill the Barrett operands [w11, w10] and [w17, w16]. */
- bn.lid x22, 0(x26)
- bn.lid x23, 32(x26)
- bn.lid x24, 0(x27)
- bn.lid x25, 32(x27)
- jal x1, barrett384_p384
- bn.mov w0, w16
- bn.mov w1, w17
-
- /* 2: [w3, w2] = t1 <= Y1*Y2 = dmem[x26+64]*dmem[x27+64] */
- bn.lid x22, 64(x26)
- bn.lid x23, 96(x26)
- bn.lid x24, 64(x27)
- bn.lid x25, 96(x27)
- jal x1, barrett384_p384
- bn.mov w2, w16
- bn.mov w3, w17
-
- /* 3: [w5, w4] = t2 <= Z1*Z2 = dmem[x26+128]*dmem[x27+128] */
- bn.lid x22, 128(x26)
- bn.lid x23, 160(x26)
- bn.lid x24, 128(x27)
- bn.lid x25, 160(x27)
- jal x1, barrett384_p384
- bn.mov w4, w16
- bn.mov w5, w17
-
- /* 4: [w7, w6] = t3 <= X1+Y1 = dmem[x26+0]+dmem[x26+64]
- Modular addition (same pattern for all additions below): compute the
- plain sum, trial-subtract p and keep the unreduced sum only if that
- subtraction borrowed (C set, i.e. sum < p). */
- bn.lid x22, 0(x26)
- bn.lid x23, 32(x26)
- bn.lid x24, 64(x26)
- bn.lid x25, 96(x26)
- bn.add w16, w10, w16
- bn.addc w17, w11, w17
- bn.sub w10, w16, w12
- bn.subb w11, w17, w13
- bn.sel w16, w16, w10, C
- bn.sel w17, w17, w11, C
- bn.mov w6, w16
- bn.mov w7, w17
-
- /* 5: [w9, w8] = t4 <= X2+Y2 = dmem[x27+0]+dmem[x27+64] */
- bn.lid x22, 0(x27)
- bn.lid x23, 32(x27)
- bn.lid x24, 64(x27)
- bn.lid x25, 96(x27)
- bn.add w16, w10, w16
- bn.addc w17, w11, w17
- bn.sub w10, w16, w12
- bn.subb w11, w17, w13
- bn.sel w16, w16, w10, C
- bn.sel w17, w17, w11, C
- bn.mov w8, w16
- bn.mov w9, w17
-
- /* 6: [w7, w6] = t3 <= t3*t4 = [w7, w6]*[w9, w8] */
- bn.mov w10, w6
- bn.mov w11, w7
- bn.mov w16, w8
- bn.mov w17, w9
- jal x1, barrett384_p384
- bn.mov w6, w16
- bn.mov w7, w17
-
- /* 7: [w9, w8] = t4 <= t0+t1 = [w1, w0]+[w3, w2] */
- bn.add w16, w0, w2
- bn.addc w17, w1, w3
- bn.sub w10, w16, w12
- bn.subb w11, w17, w13
- bn.sel w16, w16, w10, C
- bn.sel w17, w17, w11, C
- bn.mov w8, w16
- bn.mov w9, w17
-
- /* 8: [w7, w6] = t3 <= t3-t4 = [w7, w6]-[w9, w8]
- Modular subtraction (same pattern for all subtractions below): compute
- the plain difference and add p to it. C is then the carry out of that
- addition which, for reduced operands, is set exactly when the difference
- had wrapped around (was negative); only then the corrected value is
- selected. */
- bn.sub w16, w6, w8
- bn.subb w17, w7, w9
- bn.add w10, w16, w12
- bn.addc w11, w17, w13
- bn.sel w16, w10, w16, C
- bn.sel w17, w11, w17, C
- bn.mov w6, w16
- bn.mov w7, w17
-
- /* 9: [w9, w8] = t4 <= Y1+Z1 = dmem[x26+64]+dmem[x26+128] */
- bn.lid x22, 64(x26)
- bn.lid x23, 96(x26)
- bn.lid x24, 128(x26)
- bn.lid x25, 160(x26)
- bn.add w16, w10, w16
- bn.addc w17, w11, w17
- bn.sub w10, w16, w12
- bn.subb w11, w17, w13
- bn.sel w16, w16, w10, C
- bn.sel w17, w17, w11, C
- bn.mov w8, w16
- bn.mov w9, w17
-
- /* 10: [w26, w25] = X3 <= Y2+Z2 = dmem[x27+64]+dmem[x27+128] */
- bn.lid x22, 64(x27)
- bn.lid x23, 96(x27)
- bn.lid x24, 128(x27)
- bn.lid x25, 160(x27)
- bn.add w16, w10, w16
- bn.addc w17, w11, w17
- bn.sub w10, w16, w12
- bn.subb w11, w17, w13
- bn.sel w16, w16, w10, C
- bn.sel w17, w17, w11, C
- bn.mov w25, w16
- bn.mov w26, w17
-
- /* 11: [w9, w8] = t4 <= t4*X3 = [w9, w8]*[w26, w25] */
- bn.mov w10, w8
- bn.mov w11, w9
- bn.mov w16, w25
- bn.mov w17, w26
- jal x1, barrett384_p384
- bn.mov w8, w16
- bn.mov w9, w17
-
- /* 12: [w26, w25] = X3 <= t1+t2 = [w3, w2]+[w5, w4] */
- bn.add w16, w2, w4
- bn.addc w17, w3, w5
- bn.sub w10, w16, w12
- bn.subb w11, w17, w13
- bn.sel w16, w16, w10, C
- bn.sel w17, w17, w11, C
- bn.mov w25, w16
- bn.mov w26, w17
-
- /* 13: [w9, w8] = t4 <= t4-X3 = [w9, w8]-[w26, w25] */
- bn.sub w16, w8, w25
- bn.subb w17, w9, w26
- bn.add w10, w16, w12
- bn.addc w11, w17, w13
- bn.sel w16, w10, w16, C
- bn.sel w17, w11, w17, C
- bn.mov w8, w16
- bn.mov w9, w17
-
- /* 14: [w26, w25] = X3 <= X1+Z1 = dmem[x26+0]+dmem[x26+128] */
- bn.lid x22, 0(x26)
- bn.lid x23, 32(x26)
- bn.lid x24, 128(x26)
- bn.lid x25, 160(x26)
- bn.add w16, w10, w16
- bn.addc w17, w11, w17
- bn.sub w10, w16, w12
- bn.subb w11, w17, w13
- bn.sel w16, w16, w10, C
- bn.sel w17, w17, w11, C
- bn.mov w25, w16
- bn.mov w26, w17
-
- /* 15: [w28, w27] = Y3 <= X2+Z2 = dmem[x27+0]+dmem[x27+128] */
- bn.lid x22, 0(x27)
- bn.lid x23, 32(x27)
- bn.lid x24, 128(x27)
- bn.lid x25, 160(x27)
- bn.add w16, w10, w16
- bn.addc w17, w11, w17
- bn.sub w10, w16, w12
- bn.subb w11, w17, w13
- bn.sel w16, w16, w10, C
- bn.sel w17, w17, w11, C
- bn.mov w27, w16
- bn.mov w28, w17
-
- /* 16: [w26, w25] = X3 <= X3*Y3 = [w26, w25]*[w28, w27] */
- bn.mov w10, w25
- bn.mov w11, w26
- bn.mov w16, w27
- bn.mov w17, w28
- jal x1, barrett384_p384
- bn.mov w25, w16
- bn.mov w26, w17
-
- /* 17: [w28, w27] = Y3 <= t0+t2 = [w1, w0]+[w5, w4] */
- bn.add w16, w0, w4
- bn.addc w17, w1, w5
- bn.sub w10, w16, w12
- bn.subb w11, w17, w13
- bn.sel w16, w16, w10, C
- bn.sel w17, w17, w11, C
- bn.mov w27, w16
- bn.mov w28, w17
-
- /* 18: [w28, w27] = Y3 <= X3-Y3 = [w26, w25]-[w28, w27] */
- bn.sub w16, w25, w27
- bn.subb w17, w26, w28
- bn.add w10, w16, w12
- bn.addc w11, w17, w13
- bn.sel w16, w10, w16, C
- bn.sel w17, w11, w17, C
- bn.mov w27, w16
- bn.mov w28, w17
-
- /* 19: [w30, w29] = Z3 <= b*t2 = dmem[x28+0]*[w5, w4] */
- bn.lid x22, 0(x28)
- bn.lid x23, 32(x28)
- bn.mov w16, w4
- bn.mov w17, w5
- jal x1, barrett384_p384
- bn.mov w29, w16
- bn.mov w30, w17
-
- /* 20: [w26, w25] = X3 <= Y3-Z3 = [w28, w27]-[w30, w29] */
- bn.sub w16, w27, w29
- bn.subb w17, w28, w30
- bn.add w10, w16, w12
- bn.addc w11, w17, w13
- bn.sel w16, w10, w16, C
- bn.sel w17, w11, w17, C
- bn.mov w25, w16
- bn.mov w26, w17
-
- /* 21: [w30, w29] = Z3 <= X3+X3 = [w26, w25]+[w26, w25] */
- bn.add w16, w25, w25
- bn.addc w17, w26, w26
- bn.sub w10, w16, w12
- bn.subb w11, w17, w13
- bn.sel w16, w16, w10, C
- bn.sel w17, w17, w11, C
- bn.mov w29, w16
- bn.mov w30, w17
-
- /* 22: [w26, w25] = X3 <= X3+Z3 = [w26, w25]+[w30, w29] */
- bn.add w16, w25, w29
- bn.addc w17, w26, w30
- bn.sub w10, w16, w12
- bn.subb w11, w17, w13
- bn.sel w16, w16, w10, C
- bn.sel w17, w17, w11, C
- bn.mov w25, w16
- bn.mov w26, w17
-
- /* 23: [w30, w29] = Z3 <= t1-X3 = [w3, w2]-[w26, w25] */
- bn.sub w16, w2, w25
- bn.subb w17, w3, w26
- bn.add w10, w16, w12
- bn.addc w11, w17, w13
- bn.sel w16, w10, w16, C
- bn.sel w17, w11, w17, C
- bn.mov w29, w16
- bn.mov w30, w17
-
- /* 24: [w26, w25] = X3 <= t1+X3 = [w3, w2]+[w26, w25] */
- bn.add w16, w2, w25
- bn.addc w17, w3, w26
- bn.sub w10, w16, w12
- bn.subb w11, w17, w13
- bn.sel w16, w16, w10, C
- bn.sel w17, w17, w11, C
- bn.mov w25, w16
- bn.mov w26, w17
-
- /* 25: [w28, w27] = Y3 <= b*Y3 = dmem[x28+0]*[w28, w27] */
- bn.lid x22, 0(x28)
- bn.lid x23, 32(x28)
- bn.mov w16, w27
- bn.mov w17, w28
- jal x1, barrett384_p384
- bn.mov w27, w16
- bn.mov w28, w17
-
- /* 26: [w3, w2] = t1 <= t2+t2 = [w5, w4]+[w5, w4] */
- bn.add w16, w4, w4
- bn.addc w17, w5, w5
- bn.sub w10, w16, w12
- bn.subb w11, w17, w13
- bn.sel w16, w16, w10, C
- bn.sel w17, w17, w11, C
- bn.mov w2, w16
- bn.mov w3, w17
-
- /* 27: [w5, w4] = t2 <= t1+t2 = [w3, w2]+[w5, w4] */
- bn.add w16, w2, w4
- bn.addc w17, w3, w5
- bn.sub w10, w16, w12
- bn.subb w11, w17, w13
- bn.sel w16, w16, w10, C
- bn.sel w17, w17, w11, C
- bn.mov w4, w16
- bn.mov w5, w17
-
- /* 28: [w28, w27] = Y3 <= Y3-t2 = [w28, w27]-[w5, w4] */
- bn.sub w16, w27, w4
- bn.subb w17, w28, w5
- bn.add w10, w16, w12
- bn.addc w11, w17, w13
- bn.sel w16, w10, w16, C
- bn.sel w17, w11, w17, C
- bn.mov w27, w16
- bn.mov w28, w17
-
- /* 29: [w28, w27] = Y3 <= Y3-t0 = [w28, w27]-[w1, w0] */
- bn.sub w16, w27, w0
- bn.subb w17, w28, w1
- bn.add w10, w16, w12
- bn.addc w11, w17, w13
- bn.sel w16, w10, w16, C
- bn.sel w17, w11, w17, C
- bn.mov w27, w16
- bn.mov w28, w17
-
- /* 30: [w3, w2] = t1 <= Y3+Y3 = [w28, w27]+[w28, w27] */
- bn.add w16, w27, w27
- bn.addc w17, w28, w28
- bn.sub w10, w16, w12
- bn.subb w11, w17, w13
- bn.sel w16, w16, w10, C
- bn.sel w17, w17, w11, C
- bn.mov w2, w16
- bn.mov w3, w17
-
- /* 31: [w28, w27] = Y3 <= t1+Y3 = [w3, w2]+[w28, w27] */
- bn.add w16, w2, w27
- bn.addc w17, w3, w28
- bn.sub w10, w16, w12
- bn.subb w11, w17, w13
- bn.sel w16, w16, w10, C
- bn.sel w17, w17, w11, C
- bn.mov w27, w16
- bn.mov w28, w17
-
- /* 32: [w3, w2] = t1 <= t0+t0 = [w1, w0]+[w1, w0] */
- bn.add w16, w0, w0
- bn.addc w17, w1, w1
- bn.sub w10, w16, w12
- bn.subb w11, w17, w13
- bn.sel w16, w16, w10, C
- bn.sel w17, w17, w11, C
- bn.mov w2, w16
- bn.mov w3, w17
-
- /* 33: [w1, w0] = t0 <= t1+t0 = [w3, w2]+[w1, w0] */
- bn.add w16, w2, w0
- bn.addc w17, w3, w1
- bn.sub w10, w16, w12
- bn.subb w11, w17, w13
- bn.sel w16, w16, w10, C
- bn.sel w17, w17, w11, C
- bn.mov w0, w16
- bn.mov w1, w17
-
- /* 34: [w1, w0] = t0 <= t0-t2 = [w1, w0]-[w5, w4] */
- bn.sub w16, w0, w4
- bn.subb w17, w1, w5
- bn.add w10, w16, w12
- bn.addc w11, w17, w13
- bn.sel w16, w10, w16, C
- bn.sel w17, w11, w17, C
- bn.mov w0, w16
- bn.mov w1, w17
-
- /* 35: [w3, w2] = t1 <= t4*Y3 = [w9, w8]*[w28, w27] */
- bn.mov w10, w8
- bn.mov w11, w9
- bn.mov w16, w27
- bn.mov w17, w28
- jal x1, barrett384_p384
- bn.mov w2, w16
- bn.mov w3, w17
-
- /* 36: [w5, w4] = t2 <= t0*Y3 = [w1, w0]*[w28, w27] */
- bn.mov w10, w0
- bn.mov w11, w1
- bn.mov w16, w27
- bn.mov w17, w28
- jal x1, barrett384_p384
- bn.mov w4, w16
- bn.mov w5, w17
-
- /* 37: [w28, w27] = Y3 <= X3*Z3 = [w26, w25]*[w30, w29] */
- bn.mov w10, w25
- bn.mov w11, w26
- bn.mov w16, w29
- bn.mov w17, w30
- jal x1, barrett384_p384
- bn.mov w27, w16
- bn.mov w28, w17
-
- /* 38: [w28, w27] = Y3 <= Y3+t2 = [w28, w27]+[w5, w4] */
- bn.add w16, w27, w4
- bn.addc w17, w28, w5
- bn.sub w10, w16, w12
- bn.subb w11, w17, w13
- bn.sel w16, w16, w10, C
- bn.sel w17, w17, w11, C
- bn.mov w27, w16
- bn.mov w28, w17
-
- /* 39: [w26, w25] = X3 <= t3*X3 = [w7, w6]*[w26, w25] */
- bn.mov w10, w6
- bn.mov w11, w7
- bn.mov w16, w25
- bn.mov w17, w26
- jal x1, barrett384_p384
- bn.mov w25, w16
- bn.mov w26, w17
-
- /* 40: [w26, w25] = X3 <= X3-t1 = [w26, w25]-[w3, w2] */
- bn.sub w16, w25, w2
- bn.subb w17, w26, w3
- bn.add w10, w16, w12
- bn.addc w11, w17, w13
- bn.sel w16, w10, w16, C
- bn.sel w17, w11, w17, C
- bn.mov w25, w16
- bn.mov w26, w17
-
- /* 41: [w30, w29] = Z3 <= t4*Z3 = [w9, w8]*[w30, w29] */
- bn.mov w10, w8
- bn.mov w11, w9
- bn.mov w16, w29
- bn.mov w17, w30
- jal x1, barrett384_p384
- bn.mov w29, w16
- bn.mov w30, w17
-
- /* 42: [w3, w2] = t1 <= t3*t0 = [w7, w6]*[w1, w0] */
- bn.mov w10, w6
- bn.mov w11, w7
- bn.mov w16, w0
- bn.mov w17, w1
- jal x1, barrett384_p384
- bn.mov w2, w16
- bn.mov w3, w17
-
- /* 43: [w30, w29] = Z3 <= Z3+t1 = [w30, w29]+[w3, w2] */
- bn.add w16, w29, w2
- bn.addc w17, w30, w3
- bn.sub w10, w16, w12
- bn.subb w11, w17, w13
- bn.sel w16, w16, w10, C
- bn.sel w17, w17, w11, C
- bn.mov w29, w16
- bn.mov w30, w17
-
- ret
-
-
-/**
- * Convert projective coordinates of a P-384 curve point to affine coordinates
- *
- * returns P = (x_a, y_a) = (x/z mod p, y/z mod p)
- * where P is a valid P-384 curve point,
- * x_a and y_a are the resulting affine coordinates of the
- * curve point,
- * x,y and z are a set of projective coordinates of the
- * point and
- * p is the modulus of the P-384 underlying finite field.
- *
- * This routine computes the affine coordinates for a set of projective
- * coordinates of a valid P-384 curve point. The routine performs the required
- * divisions by computing the multiplicative modular inverse of the
- * projective z-coordinate in the underlying finite field of the P-384 curve.
- * For inverse computation Fermat's little theorem is used, i.e.
- * we compute z^-1 = z^(p-2) mod p.
- * For exponentiation a 16 step addition chain is used.
- * Source of the addition chain is the addchain project:
- * https://github.com/mmcloughlin/addchain/
- *
- * Flags: Flags have no meaning beyond the scope of this subroutine.
- *
- * @param[in] [w26,w25]: x, x-coordinate of curve point (projective).
- * @param[in] [w28,w27]: y, y-coordinate of curve point (projective).
- * @param[in] [w30,w29]: z, z-coordinate of curve point (projective).
- * @param[in] [w13, w12]: p, modulus of P-384.
- * @param[in] [w15, w14]: u, pre-computed Barrett constant for p,
- * lower 384 bits, i.e. (2^(2*384) div p)[383:0].
- * @param[in] w31: all-zero.
- * @param[out] [w26, w25]: x_a, affine x-coordinate of resulting point.
- * @param[out] [w28, w27]: y_a, affine y-coordinate of resulting point.
- *
- * clobbered registers: w0 to w28
- * clobbered flag groups: FG0
- */
-proj_to_affine_p384:
-
- /* Exp: 0b10 = 2*0b1
- Val: r10 = z^2 mod p
- [w17,w16] <= [w30,w29]^2 mod [w13,w12] */
- bn.mov w10, w29
- bn.mov w11, w30
- bn.mov w16, w29
- bn.mov w17, w30
- jal x1, barrett384_p384
-
- /* Exp: 0b11 = 0b1+0b10
- Val: r11 <= z*r10 mod p
- [w17,w16] <= [w30,w29]*[w17,w16] mod [w13,w12]
- The Barrett routine leaves its result in [w17,w16], which is also its
- second operand; hence only [w11,w10] is reloaded between chained calls. */
- bn.mov w10, w29
- bn.mov w11, w30
- jal x1, barrett384_p384
-
- /* Exp: 0b110 = 2*0b11
- Val: r110 = r11^2 mod p
- [w17,w16] <= [w17,w16]^2 mod [w13,w12] */
- bn.mov w10, w16
- bn.mov w11, w17
- jal x1, barrett384_p384
-
- /* Exp: 0b111 = 0b1+0b110
- Val: r111 <= z*r110 mod p
- [w1,w0] = [w17,w16] <= [w30,w29]*[w17,w16] mod [w13,w12] */
- bn.mov w10, w29
- bn.mov w11, w30
- jal x1, barrett384_p384
- bn.mov w0, w16
- bn.mov w1, w17
-
- /* Exp: 0b111000 = 0b111<<3
- Val: r111000 <= r111^(2^3) mod p
- [w17,w16] <= [w17,w16]^(2^3) mod [w13,w12]
- NOTE(review): the trailing nop in the squaring loops presumably exists
- because a loop body may not end on the jal - confirm against the OTBN
- ISA documentation. */
- loopi 3, 4
- bn.mov w10, w16
- bn.mov w11, w17
- jal x1, barrett384_p384
- nop
-
- /* Exp: 0b111111 = 0b111+0b111000
- Val: r111111 <= r111*r111000 mod p
- [w3,w2] = [w17,w16] <= [w1,w0]*[w17,w16] mod [w13,w12] */
- bn.mov w10, w0
- bn.mov w11, w1
- jal x1, barrett384_p384
- bn.mov w2, w16
- bn.mov w3, w17
-
- /* Exp: 2^12-1 = (0b111111<<6)+0b111111
- Val: r_12_1 <= r111111^(2^6)*r111111 mod p
- [w5,w4] = [w17,w16] <= [w17,w16]^(2^6)*[w3,w2] mod [w13,w12] */
- loopi 6, 4
- bn.mov w10, w16
- bn.mov w11, w17
- jal x1, barrett384_p384
- nop
- bn.mov w10, w2
- bn.mov w11, w3
- jal x1, barrett384_p384
- bn.mov w4, w16
- bn.mov w5, w17
-
- /* Exp: 2^24-1 = ((2^12-1)<<12)+(2^12-1)
- Val: r_24_1 <= r_12_1^(2^12)*r12_1 mod p
- [w17,w16] <= [w17,w16]^(2^12)*[w5,w4] mod [w13,w12] */
- loopi 12, 4
- bn.mov w10, w16
- bn.mov w11, w17
- jal x1, barrett384_p384
- nop
- bn.mov w10, w4
- bn.mov w11, w5
- jal x1, barrett384_p384
-
- /* Exp: 2^30-1 = ((2^24-1)<<6)+0b111111
- Val: r_30_1 <= r_24_1^(2^6)*r111111 mod p
- [w3, w2] = [w17,w16] <= [w17,w16]^(2^6)*[w3,w2] mod [w13,w12] */
- loopi 6, 4
- bn.mov w10, w16
- bn.mov w11, w17
- jal x1, barrett384_p384
- nop
- bn.mov w10, w2
- bn.mov w11, w3
- jal x1, barrett384_p384
- bn.mov w2, w16
- bn.mov w3, w17
-
- /* Exp: 2^31-1 <= (2^30-1)*2+0b1
- Val: r_31_1 <= r30_1^2*z mod p
- [w7,w6] = [w17,w16] <= [w17,w16]^2*[w30,w29] mod [w13,w12] */
- bn.mov w10, w16
- bn.mov w11, w17
- jal x1, barrett384_p384
- bn.mov w10, w29
- bn.mov w11, w30
- jal x1, barrett384_p384
- bn.mov w6, w16
- bn.mov w7, w17
-
- /* Exp: 2^32-1 <= (2^31-1)*2+0b1
- Val: r_32_1 <= r31_1^2*z mod p
- [w8,w9] = [w17,w16] <= [w17,w16]^2*[w30,w29] mod [w13,w12]
- Note the limb order: unlike the other temporaries, the low limb goes to
- w9 and the high limb to w8. The read-back in the final step uses the
- same order, so the value is restored correctly. */
- bn.mov w10, w16
- bn.mov w11, w17
- jal x1, barrett384_p384
- bn.mov w10, w29
- bn.mov w11, w30
- jal x1, barrett384_p384
- bn.mov w9, w16
- bn.mov w8, w17
-
- /* Exp: 2^63-1 <= ((2^32-1)<<31)+(2^31-1)
- Val: r_63_1 <= r_32_1^(2^31)*r_31_1 mod p
- [w7,w6] = [w17,w16] <= [w17,w16]^(2^31)*[w7,w6] mod [w13,w12] */
- loopi 31, 4
- bn.mov w10, w16
- bn.mov w11, w17
- jal x1, barrett384_p384
- nop
- bn.mov w10, w6
- bn.mov w11, w7
- jal x1, barrett384_p384
- bn.mov w6, w16
- bn.mov w7,w17
-
- /* Exp: 2^126-1 = ((2^63-1)<<63) + (2^63-1)
- Val: r_126_1 <= r_63_1^(2^63)*r_63_1 mod p
- [w7,w6] = [w17,w16] <= [w17,w16]^(2^63)*[w7,w6] mod [w13,w12] */
- loopi 63, 4
- bn.mov w10, w16
- bn.mov w11, w17
- jal x1, barrett384_p384
- nop
- bn.mov w10, w6
- bn.mov w11, w7
- jal x1, barrett384_p384
- bn.mov w6, w16
- bn.mov w7, w17
-
- /* Exp: 2^252-1 = ((2^126-1)<<126)+(2^126-1)
- Val: r_252_1 <= r_126_1^(2^126)*r_126_1 mod p
- [w17,w16] <= [w17,w16]^(2^126)*[w7,w6] mod [w13,w12] */
- loopi 126, 4
- bn.mov w10, w16
- bn.mov w11, w17
- jal x1, barrett384_p384
- nop
- bn.mov w10, w6
- bn.mov w11, w7
- jal x1, barrett384_p384
-
- /* Exp: 2^255-1 = ((2^252-1)<<3)+0b111
- Val: r_255_1 <= r_252_1^(2^3)*r111 mod p
- [w17,w16] <= [w17,w16]^(2^3)*[w1,w0] mod [w13,w12] */
- loopi 3, 4
- bn.mov w10, w16
- bn.mov w11, w17
- jal x1, barrett384_p384
- nop
- bn.mov w10, w0
- bn.mov w11, w1
- jal x1, barrett384_p384
-
- /* Exp: p-2 = ((((((2^255-1)<<33)+(2^32-1))<<94)+(2^30-1))<<2)+0b1
- Val: z_inv <=((r_255_1^(2^33)*r_32_1)^(2^94)*r_30_1)^(2^2)*z mod p
- [w17,w16] <= (([w17,w16]^(2^33)*[w8,w9])^(2^94)*[w3,w2])^(2^2)
- *[w30,w29] mod [w13,w12] */
- loopi 33, 4
- bn.mov w10, w16
- bn.mov w11, w17
- jal x1, barrett384_p384
- nop
- bn.mov w10, w9
- bn.mov w11, w8
- jal x1, barrett384_p384
- loopi 94, 4
- bn.mov w10, w16
- bn.mov w11, w17
- jal x1, barrett384_p384
- nop
- bn.mov w10, w2
- bn.mov w11, w3
- jal x1, barrett384_p384
- loopi 2, 4
- bn.mov w10, w16
- bn.mov w11, w17
- jal x1, barrett384_p384
- nop
- bn.mov w10, w29
- bn.mov w11, w30
- jal x1, barrett384_p384
-
- /* store inverse [w1,w0] <= [w17,w16] = z_inv*/
- bn.mov w0, w16
- bn.mov w1, w17
-
- /* convert x-coordinate to affine space
- [w26,w25] <= [w17,w16] = x_a <= x/z = x*z_inv = [w26,w25]*[w1,w0] mod p
- (z_inv is still present in [w17,w16] from the previous step) */
- bn.mov w10, w25
- bn.mov w11, w26
- jal x1, barrett384_p384
- bn.mov w25, w16
- bn.mov w26, w17
-
- /* convert y-coordinate to affine space
- [w28,w27] <= [w17,w16] = y_a <= y/z = y*z_inv = [w28,w27]*[w1,w0] mod p */
- bn.mov w10, w27
- bn.mov w11, w28
- bn.mov w16, w0
- bn.mov w17, w1
- jal x1, barrett384_p384
- bn.mov w27, w16
- bn.mov w28, w17
-
- ret
-
-
-/**
- * Fetch curve point from dmem, randomize z-coordinate and store point in dmem
- *
- * returns P = (x, y, z) = (x_a*z, y_a*z, z)
- * with P being a valid P-384 curve point in projective coordinates
- * x_a and y_a being the affine coordinates as fetched from dmem
- * z being a randomized z-coordinate
- *
- * This routine fetches the affine x- and y-coordinates of a curve point from
- * dmem and computes a valid set of projective coordinates. The z-coordinate is
- * randomized and x and y are scaled appropriately. The resulting projective
- * coordinates are stored at dmem[dptr_p_p] using 6 consecutive 256-bit cells,
- * i.e. each coordinate is stored 512 bit aligned, little endian.
- * This routine runs in constant time.
- *
- * @param[in] x20: dptr_x, pointer to dmem location containing affine
- * x-coordinate of input point
- * @param[in] x21: dptr_y, pointer to dmem location containing affine
- * y-coordinate of input point
- * @param[in] [w15, w14]: u[383:0] lower 384 bit of Barrett constant u for
- * modulus p
- * @param[in] [w13, w12]: p, modulus of P-384 underlying finite field
- * @param[in] w31: all-zero
- * @param[in] x18: dptr_p_p, pointer to dmem location to store resulting point
- * in projective space
- *
- * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
- * the upper limb of projective y-coordinate.
- *
- * clobbered registers: x10 to x13
- * w2, w3, w8 to w11, w16 to w24, w29, w30
- * clobbered flag groups: FG0
- */
-store_proj_randomize:
-
- /* get a 384-bit random number
- [w3, w2] = random(384)
- Two 256-bit words are read; the second one is shifted right by 128 bit so
- that only 128 random bits remain in the upper limb. */
- bn.wsrr w2, 1
- bn.wsrr w3, 1
- bn.rshi w3, w31, w3 >> 128
-
- /* reduce random number
- [w3, w2] = z <= [w3, w2] mod p
- A single conditional subtraction is sufficient since the random number is
- below 2^384 < 2*p. The unreduced value is kept if the subtraction
- borrowed (C set). */
- bn.sub w10, w2, w12
- bn.subb w11, w3, w13
- bn.sel w2, w2, w10, C
- bn.sel w3, w3, w11, C
-
- bn.mov w10, w2
- bn.mov w11, w3
-
- /* store z-coordinate
- [dmem[dptr_p_p+160], dmem[dptr_p_p+128]] = [w11, w10] */
- li x10, 10
- li x11, 11
- bn.sid x10, 128(x18)
- bn.sid x11, 160(x18)
-
- /* fetch x-coordinate from dmem
- [w17, w16] = x <= [dmem[dptr_x+32], dmem[dptr_x]] */
- li x12, 16
- li x13, 17
- bn.lid x12, 0(x20)
- bn.lid x13, 32(x20)
-
- /* scale and store x-coordinate
- [dmem[dptr_p_p+32], dmem[dptr_p_p]] = [w17, w16] =
- x_p <= [w11, w10] * [w17, w16] = z*x mod p */
-
- jal x1, barrett384_p384
- bn.sid x12, 0(x18)
- bn.sid x13, 32(x18)
-
- /* fetch y-coordinate from dmem
- [w17, w16] = y <= [dmem[dptr_y+32], dmem[dptr_y]] */
- bn.lid x12, 0(x21)
- bn.lid x13, 32(x21)
-
- /* scale and store y-coordinate
- [dmem[dptr_p_p+96], dmem[dptr_p_p+64]] = [w17, w16] =
- y_p <= [w11, w10] * [w17, w16] = z*y mod p
- z is copied into [w11, w10] again because the Barrett routine clobbers
- its first operand. */
- bn.mov w10, w2
- bn.mov w11, w3
- jal x1, barrett384_p384
- bn.sid x12, 64(x18)
- bn.sid x13, 96(x18)
-
- ret
-
-
-/**
- * P-384 scalar point multiplication in affine space
- *
- * returns R = k*P = k*(x_p, y_p)
- * where R, P are valid P-384 curve points in affine coordinates,
- * k is a 384-bit scalar.
- *
- * This routine performs scalar multiplication based on the group laws
- * of Weierstrass curves.
- * A constant time double-and-add algorithm (sometimes referred to as
- * double-and-add-always) is used.
- * Due to the P-384 optimized implementations of the internally called routines
- * for point addition and doubling, this routine is limited to P-384 curves.
- * The routine makes use of blinding by additive splitting the
- * exponent/scalar d into two shares. The double-and-add loop operates on both
- * shares in parallel applying Shamir's trick.
- *
- * @param[in] x9: dptr_rnd, pointer to location in dmem containing random
- * number to be used for additive splitting of scalar
- * @param[in] x19: dptr_k, pointer to scalar k (0 < k < n) in dmem
- * @param[in] x20: dptr_x, pointer to affine x-coordinate in dmem
- * @param[in] x21: dptr_y, pointer to affine y-coordinate in dmem
- * @param[in] x28: dptr_b, pointer to domain parameter b of P-384 in dmem
- * @param[in] x30: dptr_sp, pointer to 704 bytes of scratchpad memory in dmem
- * @param[in] [w15, w14]: u[383:0] lower 384 bit of Barrett constant u
- * corresponding to modulus p
- * @param[in] [w13, w12]: p, modulus of P-384 underlying finite field
- * @param[in] [w11, w10]: n, domain parameter of P-384 curve
- * (order of basepoint G)
- * @param[in] w31: all-zero
- * @param[out] [w26, w25]: x_a, affine x-coordinate of resulting point R.
- * @param[out] [w28, w26]: y_a, affine y-coordinate of resulting point R.
- *
- * Scratchpad memory layout:
- * The routine expects at least 704 bytes of scratchpad memory at dmem
- * location 'scratchpad' (sp). Internally the scratchpad is used as follows:
- * dptr_sp .. dptr_sp+191: point P, projective
- * dptr_sp+192 .. dptr_sp+255: s0, 1st share of scalar
- * dptr_sp+256 .. dptr_sp+447: point 2P, projective
- * dptr_sp+448 .. dptr_sp+511: s1, 2nd share of scalar
- * dptr_sp+512 .. dptr_sp+703: point Q, projective
- *
- * Projective coordinates of a point are kept in dmem in little endian format
- * with the individual coordinates 512 bit aligned. The coordinates are stored
- * in x,y,z order (i.e. x at lowest, z at highest address). Thus, a 384 bit
- * curve point occupies 6 consecutive 256-bit dmem cells.
- *
- * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
- * the computed affine y-coordinate.
- *
- * clobbered registers: x2, x10, x11 to x13, x18, x26, x27, w0 to w30
- * clobbered flag groups: FG0
- */
-scalar_mult_int_p384:
-
- /* set regfile pointers to in/out regs of Barrett routine. Set here to avoid
- resetting in very call to point addition routine */
- li x22, 10
- li x23, 11
- li x24, 16
- li x25, 17
-
- /* fetch externally supplied random number from dmem
- [w1, w0] = dmem[dptr_rnd] = [dmem[x9], dmem[x9+32]] = rnd */
- li x2, 0
- bn.lid x2++, 0(x9)
- bn.lid x2++, 32(x9)
-
- /* 1st share (reduced rnd)
- s0 = [w1, w0] <= rnd mod n = [w1, w0] mod [w11, w10] */
- bn.sub w9, w0, w10
- bn.subb w8, w1, w11
- bn.sel w0, w0, w9, C
- bn.sel w1, w1, w8, C
-
- /* load scalar k from dmem
- [w3, w2] = k <= dmem[dptr_k] = [dmem[x19], dmem[x19+32]] */
- bn.lid x2++, 0(x19)
- bn.lid x2, 32(x19)
-
- /* 2nd share (k-s0)
- s1 = [w3, w2] <= k - s0 mod n = [w2, w3] - [w1, w0] mod [w11, w10] */
- bn.sub w2, w2, w0
- bn.subb w3, w3, w1
- bn.add w8, w2, w10
- bn.addc w9, w3, w11
- bn.sel w2, w8, w2, C
- bn.sel w3, w9, w3, C
-
- /* left align both shares for probing of MSB in loop body */
- bn.rshi w1, w1, w0 >> 128
- bn.rshi w0, w0, w31 >> 128
- bn.rshi w3, w3, w2 >> 128
- bn.rshi w2, w2, w31 >> 128
-
- /* store shares in scratchpad */
- li x2, 0
- bn.sid x2++, 192(x30)
- bn.sid x2++, 224(x30)
- bn.sid x2++, 448(x30)
- bn.sid x2++, 480(x30)
-
- /* get randomized projective coodinates of curve point
- P = (x_p, y_p, z_p) = dmem[dptr_sp] = (x*z mod p, y*z mod p, z) */
- add x18, x30, 0
- jal x1, store_proj_randomize
-
- /* double point P
- 2P = ([w30,w29], [w28,w27], [w26, w25]) <= 2*P */
- add x27, x30, x0
- add x26, x30, x0
- jal x1, proj_add_p384
-
- /* store point 2P in scratchpad @w30+256
- dmem[dptr_sc+256] = [w30:w25] = 2P */
- li x2, 25
- bn.sid x2++, 256(x30)
- bn.sid x2++, 288(x30)
- bn.sid x2++, 320(x30)
- bn.sid x2++, 352(x30)
- bn.sid x2++, 384(x30)
- bn.sid x2++, 416(x30)
-
- /* init point Q = (0,1,0) for double-and-add in scratchpad */
- /* dmem[x26] = dmem[dptr_sc+512] = Q = (0,1,0) */
- addi x26, x30, 512
- li x2, 30
- bn.addi w30, w31, 1
- bn.sid x2++, 64(x26)
- bn.sid x2, 0(x26)
- bn.sid x2, 32(x26)
- bn.sid x2, 96(x26)
- bn.sid x2, 128(x26)
- bn.sid x2, 160(x26)
-
- /* double-and-add loop with decreasing index */
- loopi 384, 85
-
- /* double point Q
- Q = ([w30,w29], [w28,w27], [w26, w25]) <= Q + dmem[x27] */
- add x27, x26, x0
- jal x1, proj_add_p384
-
- /* store Q in dmem
- dmem[x26] = dmem[dptr_sc+512] <= [w30:w25] */
- li x2, 25
- bn.sid x2++, 0(x26)
- bn.sid x2++, 32(x26)
- bn.sid x2++, 64(x26)
- bn.sid x2++, 96(x26)
- bn.sid x2++, 128(x26)
- bn.sid x2++, 160(x26)
-
- /* Probe if MSb of either of the two scalars (rnd or d-rnd) but not both
- is 1.
- If only one MSb is set, select P for addition.
- If both MSbs are set, select 2P for addition.
- (If neither MSB is set, 2P will be selected but result discarded.) */
- li x2, 0
- bn.lid x2++, 224(x30)
- bn.lid x2, 480(x30)
- bn.xor w8, w0, w1
- /* Create conditional offeset into scratchpad.
- if (s0[512] xor s1[512]) x27 <= x30 else x27 <= x30+256 */
- csrrs x3, 0x7c0, x0
- xori x3, x3, -1
- andi x3, x3, 2
- slli x27, x3, 7
- add x27, x27, x30
-
- /* Reload randomized projective coodinates for curve point P.
- P = (x_p, y_p, z_p) = dmem[dptr_sp] <= (x*z mod p, y*z mod p, z) */
- jal x1, store_proj_randomize
-
- /* Add points Q+P or Q+2P depending on offset in x27.
- Q_a = ([w30,w29], [w28,w27], [w26, w25]) <= Q + dmem[x27] */
- jal x1, proj_add_p384
-
- /* load shares from scratchpad
- [w1, w0] = s0; [w3, w2] = s1 */
- li x2, 0
- bn.lid x2++, 192(x30)
- bn.lid x2++, 224(x30)
- bn.lid x2++, 448(x30)
- bn.lid x2++, 480(x30)
-
- /* M = s0[511] | s1[511] */
- bn.or w8, w1, w3
-
- /* load q from scratchpad
- Q = ([w9,w8], [w7,w6], [w5,w4]) <= dmem[x26] */
- li x2, 4
- bn.lid x2++, 0(x26)
- bn.lid x2++, 32(x26)
- bn.lid x2++, 64(x26)
- bn.lid x2++, 96(x26)
- bn.lid x2++, 128(x26)
- bn.lid x2++, 160(x26)
-
- /* select either Q or Q_a
- if M: Q = ([w30,w29], [w28,w27], [w26, w25]) <= Q else: Q <= Q_a */
- bn.sel w25, w25, w4, M
- bn.sel w26, w26, w5, M
- bn.sel w27, w27, w6, M
- bn.sel w28, w28, w7, M
- bn.sel w29, w29, w8, M
- bn.sel w30, w30, w9, M
-
- /* store Q in dmem
- dmem[x26] = dmem[dptr_sc+512] <= [w30:w25] */
- li x2, 25
- bn.sid x2++, 0(x26)
- bn.sid x2++, 32(x26)
- bn.sid x2++, 64(x26)
- bn.sid x2++, 96(x26)
- bn.sid x2++, 128(x26)
- bn.sid x2++, 160(x26)
-
- /* left shift both shares
- s0 <= s0 << 1 ; s1 <= s1 << 1 */
- bn.add w0, w0, w0
- bn.addc w1, w1, w1
- bn.add w2, w2, w2
- bn.addc w3, w3, w3
- /* store both shares in scratchpad */
- li x2, 0
- bn.sid x2++, 192(x30)
- bn.sid x2++, 224(x30)
- bn.sid x2++, 448(x30)
- bn.sid x2++, 480(x30)
-
-
- /* Get a fresh random number and scale the coordinates of 2P.
- (scaling each proj. coordinate by same factor results in same point) */
-
- /* get a 384-bit random number */
- bn.wsrr w2, 1
- bn.wsrr w3, 1
- bn.rshi w3, w31, w3 >> 128
-
- /* reduce random number
- [w2, w3] = z <= [w2, w3] mod p */
- bn.sub w10, w2, w12
- bn.subb w11, w3, w13
- bn.sel w2, w2, w10, C
- bn.sel w3, w3, w11, C
-
- /* scale all coordinates in scratchpad */
- li x2, 16
- li x3, 17
- /* x-coordinate */
- bn.mov w10, w2
- bn.mov w11, w3
- bn.lid x2, 256(x30)
- bn.lid x3, 288(x30)
- jal x1, barrett384_p384
- bn.sid x2, 256(x30)
- bn.sid x3, 288(x30)
- /* y-coordinate */
- bn.mov w10, w2
- bn.mov w11, w3
- bn.lid x2, 320(x30)
- bn.lid x3, 352(x30)
- jal x1, barrett384_p384
- bn.sid x2, 320(x30)
- bn.sid x3, 352(x30)
- /* z-coordinate */
- bn.mov w10, w2
- bn.mov w11, w3
- bn.lid x2, 384(x30)
- bn.lid x3, 416(x30)
- jal x1, barrett384_p384
- bn.sid x2, 384(x30)
- bn.sid x3, 416(x30)
-
- /* convert coordinates to affine space */
- jal x1, proj_to_affine_p384
-
- ret
-
-
-/**
- * Externally callable wrapper for P-384 scalar point multiplication
- *
- * returns R = k*P = k*(x_p, y_p)
- * where R, P are valid P-384 curve points in affine coordinates,
- * k is a 384-bit scalar..
- *
- * Sets up context and calls internal scalar multiplication routine.
- * This routine runs in constant time.
- *
- * @param[in] dmem[0]: dK, pointer to location in dmem containing scalar k
- * @param[in] dmem[4]: dRnd, pointer to location in dmem containing random
- * number for blinding
- * @param[in] dmem[20]: dptr_x, pointer to affine x-coordinate in dmem
- * @param[in] dmem[22]: dptr_y, pointer to affine y-coordinate in dmem
- *
- * 384-bit quantities have to be provided in dmem in little-endian format,
- * 512 bit aligned, with the highest 128 bit set to zero.
- *
- * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
- * the computed affine y-coordinate.
- *
- * clobbered registers: x2, x3, x9 to x13, x18 to x21, x26 to x30
- * w0 to w30
- * clobbered flag groups: FG0
- */
-.globl scalar_mult_p384
-scalar_mult_p384:
-
- /* set dmem pointer to point x-coordinate */
- la x20, dptr_x
- lw x20, 0(x20)
-
- /* set dmem pointer to point y-coordinate */
- la x21, dptr_y
- lw x21, 0(x21)
-
- /* set dmem pointer to scalar k */
- la x19, dptr_k
- lw x19, 0(x19)
-
- /* set pointer to blinding parameter */
- la x9, dptr_rnd
- lw x9, 0(x9)
-
- /* set dmem pointer to domain parameter b */
- la x28, p384_b
-
- /* set dmem pointer to scratchpad */
- la x30, scratchpad
-
- /* load domain parameter p (modulus)
- [w13, w12] = p = dmem[p384_p] */
- li x2, 12
- la x3, p384_p
- bn.lid x2++, 0(x3)
- bn.lid x2++, 32(x3)
-
- /* load Barrett constant u for modulus p
- [w15, w14] = u_p = dmem[p384_u_p] */
- li x2, 14
- la x3, p384_u_p
- bn.lid x2++, 0(x3)
- bn.lid x2++, 32(x3)
-
- /* load domain parameter n (order of base point)
- [w11, w10] = n = dmem[p384_n] */
- li x2, 10
- la x3, p384_n
- bn.lid x2++, 0(x3)
- bn.lid x2++, 32(x3)
-
- /* init all-zero reg */
- bn.xor w31, w31, w31
-
- jal x1, scalar_mult_int_p384
-
- ret
-
-
-/**
- * Variable-time modular multiplicative inverse computation
- *
- * returns x_inv = x^-1 mod m
- *
- * This routine computes the modular multiplicative inverse for any x < m in
- * the finite field GF(m) where m is prime.
- *
- * For inverse computation, Fermat's little theorem is used, i.e.
- * we compute x^-1 = x^(m-2) mod m.
- * For exponentiation we use a standard, variable-time (!) square and multiply
- * algorithm.
- *
- * This routine is mainly intended to be used for inversion of scalars in
- * context of the P-384 curve. In theory, it can be used with any 384-bit
- * modulus m with a corresponding 385-bit Barrett constant u,
- * where u[383:192] = 0.
- *
- * Note: When used for P-384 scalar inversion, the routine will need 672 calls
- * to the multiplication routine. By using an adder chain this could be reduced
- * to ~433 multiplications, however, at the cost of a significant codes size
- * increase.
- *
- * Note: This routine runs in variable-time w.r.t. the modulus. It should only
- * be used with a non-secret modulus.
- *
- * @param[in] [w13, w12]: m, 384 bit modulus
- * @param[in] [w15, w14]: u[383:0], lower 384 bit of pre-computed Barrett
- * constant corresponding to modulus m
- * @param[in] [w30, w29]: x, 384 bit operand
- * @param[in] w31, all-zero
- * @param[out] [w17, w16]: x_inv, modular multiplicative inverse
- *
- * Flags: Flags have no meaning beyond the scope of this subroutine.
- *
- * clobbered registers: x2, w2, w3, w10, w11, w16 to w24
- * clobbered flag groups: FG0
- */
-mod_inv_n_p384:
-
- /* subtract 2 from modulus for Fermat's little theorem
- [w13,w12] <= m - 2 = [w11,w10]-2 (left aligned) */
- bn.subi w2, w12, 2
- bn.subb w3, w13, w31
- bn.rshi w3, w3, w2 >> 128
- bn.rshi w2, w2, w31 >> 128
-
- /* init square and multiply: [w17,w16] = 1 */
- bn.addi w16, w31, 1
- bn.mov w17, w31
-
- /* square and multiply loop */
- loopi 384, 12
-
- /* square: [w17,w16] <= [w17, w16]*[w11,w10] mod [w13, w12] */
- bn.mov w10, w16
- bn.mov w11, w17
- jal x1, barrett384_p384
-
- /* shift MSB into carry flag
- [w3,w2] = 2*[w3,w2] = [w3,w2] << 1 */
- bn.add w2, w2, w2
- bn.addc w3, w3, w3
-
- /* skip multiplication if C flag not set */
- csrrs x2, 1984, x0
- andi x2, x2, 1
- beq x2, x0, nomul
-
- /* multiply: [w17,w16] <= [w17, w16]*[w30,w29] mod [w13, w12] */
- bn.mov w10, w29
- bn.mov w11, w30
- jal x1, barrett384_p384
-
- nomul:
- nop
-
- ret
-
-
-/**
- * P-384 ECDSA signature generation
- *
- * returns the signature as the pair r, s with
- * r = x_1 mod n
- * and s = k^(-1)(msg + r*d) mod n
- * where x_1 is the affine x-coordinate of the curve point k*G,
- * G is the curve's base point,
- * k is a supplied secret random number,
- * n is the order of the base point G of P-256,
- * msg is the message to be signed, and
- * d is the private key.
- *
- * This routine runs in constant time.
- *
- * @param[in] dmem[0]: dptr_k, pointer to a 384 bit random secret in dmem
- * @param[in] dmem[4]: dptr_rnd, pointer to location in dmem containing
- * a 384-bit random number for blinding
- * @param[in] dmem[8]: dptr_msg, pointer to the message to be signed in dmem
- * @param[in] dmem[12]: dptr_r, pointer to dmem location where s component
- * of signature will be placed
- * @param[in] dmem[16]: dptr_s, pointer to dmem location where r component
- * of signature will be placed
- * @param[in] dmem[28]: dptr_d, pointer to private key d in dmem
- *
- * Flags: Flags have no meaning beyond the scope of this subroutine.
- *
- * clobbered registers: x2, x3, x9 to x13, x18 to x28, x30
- * w0 to w31
- * clobbered flag groups: FG0
- */
-.globl p384_sign
-p384_sign:
- /* init all-zero reg */
- bn.xor w31, w31, w31
-
- /* set dmem pointer to domain parameter b */
- la x28, p384_b
-
- /* set dmem pointer to basepoint x-coordinate */
- la x20, p384_gx
-
- /* set dmem pointer to basepoint y-coordinate */
- la x21, p384_gy
-
- /* set dmem pointer to secret random scalar k */
- la x19, dptr_k
- lw x19, 0(x19)
-
- /* set pointer to blinding parameter */
- la x9, dptr_rnd
- lw x9, 0(x9)
-
- /* set dmem pointer to scratchpad */
- la x30, scratchpad
-
- /* load domain parameter p (modulus)
- [w13, w12] <= p = dmem[dptr_p] */
- li x2, 12
- la x3, p384_p
- bn.lid x2++, 0(x3)
- bn.lid x2++, 32(x3)
-
- /* load Barrett constant u for modulus p
- [w15, w14] = u_p = dmem[p384_u_p] */
- li x2, 14
- la x3, p384_u_p
- bn.lid x2++, 0(x3)
- bn.lid x2++, 32(x3)
-
- /* load domain parameter n (order of base point)
- [w11, w10] = n = dmem[p384_n] */
- li x2, 10
- la x3, p384_n
- bn.lid x2++, 0(x3)
- bn.lid x2++, 32(x3)
-
- /* scalar multiplication with base point
- [w28:w25] <= (x_1, y_1) = k*G */
- jal x1, scalar_mult_int_p384
-
- /* store r of signature in dmem: dmem[dptr_r] <= r = [w26,w25] */
- li x2, 25
- la x3, dptr_r
- lw x3, 0(x3)
- bn.sid x2++, 0(x3)
- bn.sid x2++, 32(x3)
-
- /* load secret random number k from dmem
- [w30,w29] <= k = dmem[dptr_k] */
- li x2, 29
- bn.lid x2++, 0(x19)
- bn.lid x2++, 32(x19)
-
- /* load domain parameter n (order of base point)
- [w13, w12] <= p = dmem[p384_n] */
- li x2, 12
- la x3, p384_n
- bn.lid x2++, 0(x3)
- bn.lid x2++, 32(x3)
-
- /* load Barrett constant u_n for modulus n for scalar operations
- [w15, w14] <= u_m = dmem[p384_u_n] */
- li x2, 14
- la x3, p384_u_n
- bn.lid x2++, 0(x3)
- bn.lid x2++, 32(x3)
-
- /* modular multiplicative inverse of k
- [w3, w2] <= [w17, w16] <= k^(-1) mod n */
- jal x1, mod_inv_n_p384
- bn.mov w2, w16
- bn.mov w3, w17
-
- /* load private key d from dmem
- [w11,w10] <= d = dmem[dptr_d] */
- li x2, 10
- la x3, dptr_d
- lw x3, 0(x3)
- bn.lid x2++, 0(x3)
- bn.lid x2++, 32(x3)
-
- /* [w17, w16] <= k^(-1)*d mod n = [w17, w16] * [w11, w10] mod [w13, w12] */
- jal x1, barrett384_p384
-
- /* [w5, w4] <= [w17, w16]
- <= r * (k^(-1)*d) mod n = [w26, w25] * [w17, w16] mod [w13, w12] */
- bn.mov w10, w25
- bn.mov w11, w26
- jal x1, barrett384_p384
- bn.mov w4, w16
- bn.mov w5, w17
-
- /* load message from dmem
- [w11, w10] <= msg = dmem[dptr_msg] */
- li x2, 10
- la x3, dptr_msg
- lw x3, 0(x3)
- bn.lid x2++, 0(x3)
- bn.lid x2++, 32(x3)
-
- /* [w17, w16] <= k^(-1) * msg = [w3, w2]*[w17, w16] mod n */
- bn.mov w16, w2
- bn.mov w17, w3
- jal x1, barrett384_p384
-
- /* [w28, w27] <= s' = k^(-1)*msg + k^(-1)*r*d = [w17, w16] + [w5, w4]*/
- bn.add w27, w16, w4
- bn.addc w28, w17, w5
-
- /* reduce s: [w28, w27] <= s <= s' mod n = [w28, w27] mod [w13, w12] */
- bn.sub w10, w27, w12
- bn.subb w11, w28, w13
- bn.sel w27, w27, w10, C
- bn.sel w28, w28, w11, C
-
- /* store s of signature in dmem: dmem[dptr_s] <= s = [w28, w27] */
- li x2, 27
- la x3, dptr_s
- lw x3, 0(x3)
- bn.sid x2++, 0(x3)
- bn.sid x2++, 32(x3)
-
- ret
-
-
-/* constants, pointers and scratchpad memory */
-.section .data
-
-/* pointer to k (dptr_k) */
-.globl dptr_k
-dptr_k:
- .zero 4
-
-/* pointer to rnd (dptr_rnd) */
-.globl dptr_rnd
-dptr_rnd:
- .zero 4
-
-/* pointer to msg (dptr_msg) */
-.globl dptr_msg
-dptr_msg:
- .zero 4
-
-/* pointer to R (dptr_r) */
-.globl dptr_r
-dptr_r:
- .zero 4
-
-/* pointer to S (dptr_s) */
-.globl dptr_s
-dptr_s:
- .zero 4
-
-/* pointer to X (dptr_x) */
-.globl dptr_x
-dptr_x:
- .zero 4
-
-/* pointer to Y (dptr_y) */
-.globl dptr_y
-dptr_y:
- .zero 4
-
-/* pointer to D (dptr_d) */
-.globl dptr_d
-dptr_d:
- .zero 4
-
-/* P-384 domain parameter b */
-.globl p384_b
-p384_b:
- .word 0xd3ec2aef
- .word 0x2a85c8ed
- .word 0x8a2ed19d
- .word 0xc656398d
- .word 0x5013875a
- .word 0x0314088f
- .word 0xfe814112
- .word 0x181d9c6e
- .word 0xe3f82d19
- .word 0x988e056b
- .word 0xe23ee7e4
- .word 0xb3312fa7
- .zero 16
-
-/* P-384 domain parameter p (modulus) */
-.globl p384_p
-p384_p:
- .word 0xffffffff
- .word 0x00000000
- .word 0x00000000
- .word 0xffffffff
- .word 0xfffffffe
- .word 0xffffffff
- .word 0xffffffff
- .word 0xffffffff
- .word 0xffffffff
- .word 0xffffffff
- .word 0xffffffff
- .word 0xffffffff
- .zero 16
-
-/* barrett constant u for modulus p */
-.globl p384_u_p
-p384_u_p:
- .word 0x00000001
- .word 0xffffffff
- .word 0xffffffff
- .word 0x00000000
- .word 0x00000001
- .word 0x00000000
- .word 0x00000000
- .word 0x00000000
- .word 0x00000000
- .word 0x00000000
- .word 0x00000000
- .word 0x00000000
- .zero 16
-
-/* P-384 domain parameter n (order of base point) */
-p384_n:
- .word 0xccc52973
- .word 0xecec196a
- .word 0x48b0a77a
- .word 0x581a0db2
- .word 0xf4372ddf
- .word 0xc7634d81
- .word 0xffffffff
- .word 0xffffffff
- .word 0xffffffff
- .word 0xffffffff
- .word 0xffffffff
- .word 0xffffffff
- .zero 16
-
-/* barrett constant u for n */
-p384_u_n:
- .word 0x333ad68d
- .word 0x1313e695
- .word 0xb74f5885
- .word 0xa7e5f24d
- .word 0x0bc8d220
- .word 0x389cb27e
- .word 0x00000000
- .word 0x00000000
- .word 0x00000000
- .word 0x00000000
- .word 0x00000000
- .word 0x00000000
- .zero 16
-
-/* P-384 basepoint G affine x-coordinate */
-p384_gx:
- .word 0x72760ab7
- .word 0x3a545e38
- .word 0xbf55296c
- .word 0x5502f25d
- .word 0x82542a38
- .word 0x59f741e0
- .word 0x8ba79b98
- .word 0x6e1d3b62
- .word 0xf320ad74
- .word 0x8eb1c71e
- .word 0xbe8b0537
- .word 0xaa87ca22
- .zero 16
-
-/* P-384 basepoint G affine y-coordinate */
-p384_gy:
- .word 0x90ea0e5f
- .word 0x7a431d7c
- .word 0x1d7e819d
- .word 0x0a60b1ce
- .word 0xb5f0b8c0
- .word 0xe9da3113
- .word 0x289a147c
- .word 0xf8f41dbd
- .word 0x9292dc29
- .word 0x5d9e98bf
- .word 0x96262c6f
- .word 0x3617de4a
- .zero 16
-
-/* 704 bytes of scratchpad memory */
-scratchpad:
- .zero 704
diff --git a/sw/otbn/code-snippets/p384_base.s b/sw/otbn/code-snippets/p384_base.s
new file mode 100644
index 0000000..f096167
--- /dev/null
+++ b/sw/otbn/code-snippets/p384_base.s
@@ -0,0 +1,832 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+/*
+ * This library contains:
+ * - P-384 specific routines for point addition in projective space
+ * - P-384 domain parameters
+ */
+
+ .section .text
+
+/**
+ * 384-bit modular multiplication based on Barrett reduction algorithm
+ * optimized for the special modulus of the NIST P-384 curve.
+ *
+ * Returns c = a x b % p.
+ *
+ * Expects: two operands, modulus p and pre-calculated parameter u for Barrett
+ * reduction (usually Greek mu in literature). u is expected without the
+ * leading 1 at bit 384. u has to be pre-calculated as u = floor(2^768/p).
+ *
+ * This implementation mostly follows the description in the
+ * "Handbook of Applied Cryptography" in Algorithm 14.42.
+ * Differences:
+ * - This implementation incorporates a multiplication before the reduction.
+ * Therefore it expects two operands (a, b) instead of a wider integer x.
+ * - The computation of q2 ignores the MSbs of q1 and u to allow using
+ * a 384x384 bit multiplication. This is compensated later by
+ * individual (conditional) additions.
+ * - The truncations in step 2 of HAC 14.42 in the form of (... mod b^(k+1) )
+ * are not implemented here and the full register width is used. This
+ * allows to omit computation of r1 (since r1=x) and step 3 of HAC 14.42.
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * @param[in] [w11, w10]: a, first operand, max. length 384 bit, a < p.
+ * @param[in] [w17, w16]: b, second operand, max. length 384 bit, b < p.
+ * @param[in] [w13, w12]: p, modulus of P384 i.e.:
+ * p = 2^384 - 2^128 - 2^96 + 2^32 - 1.
+ * @param[in] [w15, w14]: u, pre-computed Barrett constant (without u[384]/MSb
+ * of u which is always 1 for the allowed range but
+ * has to be set to 0 here).
+ * @param[in] w31: all-zero.
+ * @param[out] [w17, w16]: c, result, max. length 384 bit.
+ *
+ * Clobbered registers: w10, w11, w16, w17, w18, w19, w20, w21, w22, w23, w24
+ * Clobbered flag groups: FG0
+ */
+ .globl barrett384_p384
+barrett384_p384:
+ /* Compute the integer product of the operands x = a * b
+ x = [w18, w22, w21] = a * b = [w11, w10] * [w17, w16]
+ => max. length x: 768 bit */
+ bn.mulqacc.z w10.0, w16.0, 0
+ bn.mulqacc w10.0, w16.1, 64
+ bn.mulqacc.so w21.L, w10.1, w16.0, 64
+ bn.mulqacc w10.0, w16.2, 0
+ bn.mulqacc w10.1, w16.1, 0
+ bn.mulqacc w10.2, w16.0, 0
+ bn.mulqacc w10.0, w16.3, 64
+ bn.mulqacc w10.1, w16.2, 64
+ bn.mulqacc w10.2, w16.1, 64
+ bn.mulqacc.so w21.U, w10.3, w16.0, 64
+ bn.mulqacc w10.0, w17.0, 0
+ bn.mulqacc w10.1, w16.3, 0
+ bn.mulqacc w10.2, w16.2, 0
+ bn.mulqacc w10.3, w16.1, 0
+ bn.mulqacc w11.0, w16.0, 0
+ bn.mulqacc w10.0, w17.1, 64
+ bn.mulqacc w10.1, w17.0, 64
+ bn.mulqacc w10.2, w16.3, 64
+ bn.mulqacc w10.3, w16.2, 64
+ bn.mulqacc w11.0, w16.1, 64
+ bn.mulqacc.so w22.L, w11.1, w16.0, 64
+ bn.mulqacc w10.1, w17.1, 0
+ bn.mulqacc w10.2, w17.0, 0
+ bn.mulqacc w10.3, w16.3, 0
+ bn.mulqacc w11.0, w16.2, 0
+ bn.mulqacc w11.1, w16.1, 0
+ bn.mulqacc w10.2, w17.1, 64
+ bn.mulqacc w10.3, w17.0, 64
+ bn.mulqacc w11.0, w16.3, 64
+ bn.mulqacc.so w22.U, w11.1, w16.2, 64
+ bn.mulqacc w10.3, w17.1, 0
+ bn.mulqacc w11.0, w17.0, 0
+ bn.mulqacc w11.1, w16.3, 0
+ bn.mulqacc w11.0, w17.1, 64
+ bn.mulqacc.so w18.L, w11.1, w17.0, 64
+ bn.mulqacc.so w18.U, w11.1, w17.1, 0
+
+ /* Store correction factor to compensate for later neglected MSb of x.
+ x is 768 bit wide and therefore the 383 bit right shifted version q1
+ (below) contains 385 bit. Bit 384 of q1 is neglected to allow using a
+ 384x384 multiplier. For the MSb of x being set we temporarily store u
+ (or zero) here to be used in a later constant time correction of a
+ multiplication with u. Note that this requires the MSb flag being carried
+ over from the multiplication routine. */
+ bn.sel w23, w14, w31, M
+ bn.sel w24, w15, w31, M
+
+ /* Compute q1 = x >> 383
+ q1 = [w11, w10] = [w18, w22, w21] >> 383 = [w18, w22] >> 127
+ => max length q1: 385 bits */
+ bn.rshi w11, w31, w18 >> 127
+ bn.rshi w10, w18, w22 >> 127
+
+ /* Compute q2 = q1*u
+ Instead of full q2 (which would be up to 770 bits) we ignore the MSb of u
+ and the MSb of q1 and correct this later. This allows using a 384x384
+ multiplier. We use the property that u for the modulus of P384 is zero in
+ the bits 383 downto 129 and use a 384x192 multiplication routine.
+ => max. length q2': 513 bit
+ q2' = q1[383:0]*u[128:0] = [w18, w17, w16] = [w11, w10] * [w15, w14] */
+
+ /* 576 = 384*192 bit multiplication kernel */
+ bn.mulqacc.z w10.0, w14.0, 0
+ bn.mulqacc w10.0, w14.1, 64
+ bn.mulqacc.so w16.L, w10.1, w14.0, 64
+ bn.mulqacc w10.0, w14.2, 0
+ bn.mulqacc w10.1, w14.1, 0
+ bn.mulqacc w10.2, w14.0, 0
+ bn.mulqacc w10.1, w14.2, 64
+ bn.mulqacc w10.2, w14.1, 64
+ bn.mulqacc.so w16.U, w10.3, w14.0, 64
+ bn.mulqacc w10.2, w14.2, 0
+ bn.mulqacc w10.3, w14.1, 0
+ bn.mulqacc w11.0, w14.0, 0
+ bn.mulqacc w10.3, w14.2, 64
+ bn.mulqacc w11.0, w14.1, 64
+ bn.mulqacc.so w17.L, w11.1, w14.0, 64
+ bn.mulqacc w11.0, w14.2, 0
+ bn.mulqacc w11.1, w14.1, 0
+ bn.mulqacc.so w17.U, w11.1, w14.2, 64
+
+ /* w14.3 is always zero here due to structure of Barrett constant */
+ bn.mulqacc.wo w18, w11.1, w14.3, 64
+
+ /* q3 = q2 >> 385
+ In this step, the compensation for the neglected MSbs of q1 and u is
+ carried out along the way. To add them in the q2 domain, they would have
+ to be left shifted by 384 bit first. To directly add them we first shift
+ q2' by 384 bit to the right, perform the additions, and shift the result
+ another bit to the right. The additions cannot overflow due to leading
+ zeros after shift.
+ q2'' = q2' >> 384 = [w20, w19] = [w18, w17, w16] >> 384
+ = [w18, w17] >> 128 */
+ bn.rshi w20, w31, w18 >> 128
+ bn.rshi w19, w18, w17 >> 128
+ /* Add q1. This is unconditional since MSb of u is always 1.
+ This cannot overflow due to leading zeros.
+ q2''' = q2'' + q1 = [w20, w19] = [w20, w19] + [w11, w10] */
+ bn.add w19, w19, w10
+ bn.addc w20, w20, w11
+ /* Conditionally add u (without leading 1) in case of MSb of x being set.
+ This is the "real" q2 but shifted by 384 bits to the right. This cannot
+ overflow due to leading zeros.
+ q2'''' = x[767]?q2'''+u[383:0]:q2'''
+ = [w20, w19] + [w24, w23] = q2 >> 384 */
+ bn.add w19, w19, w23
+ bn.addc w20, w20, w24
+ /* finally this gives q3 by shifting the remaining bit to the right
+ q3 = q2 >> 385 = q2'''' >> 1 = [w11, w10] = [w20, w19] >> 1 */
+ bn.rshi w11, w31, w20 >> 1
+ bn.rshi w10, w20, w19 >> 1
+
+ /* r2 = (q3*p)[511:0] = [w17, w16] = ([w11, w10] * [w13, w12])[511:0]
+ A 384x384 bit multiplication kernel is used here, hence both q3 and p
+ must not be wider than 384 bit. This is always the case for p. For q3 it
+ is the case if a<p and b<p.
+ The 256 highest bits of the multiplication result are not needed,
+ so we do not compute them. */
+ bn.mulqacc.z w10.0, w12.0, 0
+ bn.mulqacc w10.0, w12.1, 64
+ bn.mulqacc.so w16.L, w10.1, w12.0, 64
+ bn.mulqacc w10.0, w12.2, 0
+ bn.mulqacc w10.1, w12.1, 0
+ bn.mulqacc w10.2, w12.0, 0
+ bn.mulqacc w10.0, w12.3, 64
+ bn.mulqacc w10.1, w12.2, 64
+ bn.mulqacc w10.2, w12.1, 64
+ bn.mulqacc.so w16.U, w10.3, w12.0, 64
+ bn.mulqacc w10.0, w13.0, 0
+ bn.mulqacc w10.1, w12.3, 0
+ bn.mulqacc w10.2, w12.2, 0
+ bn.mulqacc w10.3, w12.1, 0
+ bn.mulqacc w11.0, w12.0, 0
+ bn.mulqacc w10.0, w13.1, 64
+ bn.mulqacc w10.1, w13.0, 64
+ bn.mulqacc w10.2, w12.3, 64
+ bn.mulqacc w10.3, w12.2, 64
+ bn.mulqacc w11.0, w12.1, 64
+ bn.mulqacc.so w17.L, w11.1, w12.0, 64
+ bn.mulqacc w10.1, w13.1, 0
+ bn.mulqacc w10.2, w13.0, 0
+ bn.mulqacc w10.3, w12.3, 0
+ bn.mulqacc w11.0, w12.2, 0
+ bn.mulqacc w11.1, w12.1, 0
+ bn.mulqacc w10.2, w13.1, 64
+ bn.mulqacc w10.3, w13.0, 64
+ bn.mulqacc w11.0, w12.3, 64
+ bn.mulqacc.so w17.U, w11.1, w12.2, 64
+
+ /* Compute r = x-r2 = x-q3*p
+ since 0 <= r < 3*p, we only need to consider the lower limbs of x and r2
+ r[511:0] = [w22, w21] - [w17, w16] */
+ bn.sub w21, w21, w16
+ bn.subb w22, w22, w17
+
+ /* Barrett algorithm requires subtraction of the modulus at most two times if
+ result is too large. However in the special case of P-384 we need to
+ subtract only once */
+ bn.sub w16, w21, w12
+ bn.subb w17, w22, w13
+ bn.sel w16, w21, w16, C
+ bn.sel w17, w22, w17, C
+
+ /* return result: c = [w17, w16] = a * b % p. */
+ ret
+
+
+/**
+ * P-384 point addition in projective space
+ *
+ * returns R = (x_r, y_r, z_r) <= P+Q = (x_p, y_p, z_p) + (x_q, y_q, z_q)
+ * with R, P and Q being valid P-384 curve points
+ * in projective coordinates
+ *
+ * This routine adds two valid P-384 curve points in projective space.
+ * Point addition is performed based on the complete formulas of Bosma and
+ * Lenstra for Weierstrass curves as first published in [1] and
+ * optimized in [2].
+ * The implemented version follows Algorithm 4 of [2] which is an optimized
+ * variant for Weierstrass curves with domain parameter 'a' set to a=-3.
+ * Numbering of the steps below and naming of symbols follows the
+ * terminology of Algorithm 4 of [2].
+ * The routine is limited to P-384 curve points due to:
+ * - fixed a=-3 domain parameter
+ * - usage of a P-384 optimized Barrett multiplication kernel
+ * This routine runs in constant time.
+ *
+ * [1] https://doi.org/10.1006/jnth.1995.1088
+ * [2] https://doi.org/10.1007/978-3-662-49890-3_16
+ *
+ * @param[in] x22: set to 10, pointer to in reg for Barrett routine
+ * @param[in] x23: set to 11, pointer to in reg for Barrett routine
+ * @param[in] x24: set to 16, pointer to in/out reg for Barrett routine
+ * @param[in] x25: set to 17, pointer to in/out reg for Barrett routine
+ * @param[in] x26: dptr_p_p, dmem pointer to point P in dmem (projective)
+ * @param[in] x27: dptr_q_p, dmem pointer to point Q in dmem (projective)
+ * @param[in] x28: dptr_b, dmem pointer to domain parameter b of P-384 in dmem
+ * @param[in] [w13, w12]: p, modulus of underlying field of P-384
+ * @param[in] [w15, w14]: u[383:0] lower 384 bit of Barrett constant u for
+ * modulus p
+ * @param[in] w31: all-zero.
+ * @param[out] [w26, w25]: x_r, x-coordinate of resulting point R
+ * @param[out] [w28, w27]: y_r, y-coordinate of resulting point R
+ * @param[out] [w30, w29]: z_r, z-coordinate of resulting point R
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * clobbered registers: w0 to w30
+ * clobbered flag groups: FG0
+ */
+.globl proj_add_p384
+proj_add_p384:
+ /* mapping of parameters to symbols of [2] (Algorithm 4):
+ X1 = x_p; Y1 = y_p; Z1 = z_p; X2 = x_q; Y2 = y_q; Z2 = z_q
+ X3 = x_r; Y3 = y_r; Z3 = z_r */
+
+ /* 1: [w1, w0] = t0 <= X1*X2 = dmem[x26+0]*dmem[x27+0] */
+ bn.lid x22, 0(x26)
+ bn.lid x23, 32(x26)
+ bn.lid x24, 0(x27)
+ bn.lid x25, 32(x27)
+ jal x1, barrett384_p384
+ bn.mov w0, w16
+ bn.mov w1, w17
+
+ /* 2: [w3, w2] = t1 <= Y1*Y2 = dmem[x26+64]*dmem[x27+64] */
+ bn.lid x22, 64(x26)
+ bn.lid x23, 96(x26)
+ bn.lid x24, 64(x27)
+ bn.lid x25, 96(x27)
+ jal x1, barrett384_p384
+ bn.mov w2, w16
+ bn.mov w3, w17
+
+ /* 3: [w5, w4] = t2 <= Z1*Z2 = dmem[x26+128]*dmem[x27+128] */
+ bn.lid x22, 128(x26)
+ bn.lid x23, 160(x26)
+ bn.lid x24, 128(x27)
+ bn.lid x25, 160(x27)
+ jal x1, barrett384_p384
+ bn.mov w4, w16
+ bn.mov w5, w17
+
+ /* 4: [w7, w6] = t3 <= X1+Y1 = dmem[x26+0]+dmem[x26+64] */
+ bn.lid x22, 0(x26)
+ bn.lid x23, 32(x26)
+ bn.lid x24, 64(x26)
+ bn.lid x25, 96(x26)
+ bn.add w16, w10, w16
+ bn.addc w17, w11, w17
+ bn.sub w10, w16, w12
+ bn.subb w11, w17, w13
+ bn.sel w16, w16, w10, C
+ bn.sel w17, w17, w11, C
+ bn.mov w6, w16
+ bn.mov w7, w17
+
+ /* 5: [w9, w8] = t4 <= X2+Y2 = dmem[x27+0]+dmem[x27+64] */
+ bn.lid x22, 0(x27)
+ bn.lid x23, 32(x27)
+ bn.lid x24, 64(x27)
+ bn.lid x25, 96(x27)
+ bn.add w16, w10, w16
+ bn.addc w17, w11, w17
+ bn.sub w10, w16, w12
+ bn.subb w11, w17, w13
+ bn.sel w16, w16, w10, C
+ bn.sel w17, w17, w11, C
+ bn.mov w8, w16
+ bn.mov w9, w17
+
+ /* 6: [w7, w6] = t3 <= t3*t4 = [w7, w6]*[w9, w8] */
+ bn.mov w10, w6
+ bn.mov w11, w7
+ bn.mov w16, w8
+ bn.mov w17, w9
+ jal x1, barrett384_p384
+ bn.mov w6, w16
+ bn.mov w7, w17
+
+ /* 7: [w9, w8] = t4 <= t0+t1 = [w1, w0]+[w3, w2] */
+ bn.add w16, w0, w2
+ bn.addc w17, w1, w3
+ bn.sub w10, w16, w12
+ bn.subb w11, w17, w13
+ bn.sel w16, w16, w10, C
+ bn.sel w17, w17, w11, C
+ bn.mov w8, w16
+ bn.mov w9, w17
+
+ /* 8: [w7, w6] = t3 <= t3-t4 = [w7, w6]-[w9, w8] */
+ bn.sub w16, w6, w8
+ bn.subb w17, w7, w9
+ bn.add w10, w16, w12
+ bn.addc w11, w17, w13
+ bn.sel w16, w10, w16, C
+ bn.sel w17, w11, w17, C
+ bn.mov w6, w16
+ bn.mov w7, w17
+
+ /* 9: [w9, w8] = t4 <= Y1+Z1 = dmem[x26+64]+dmem[x26+128] */
+ bn.lid x22, 64(x26)
+ bn.lid x23, 96(x26)
+ bn.lid x24, 128(x26)
+ bn.lid x25, 160(x26)
+ bn.add w16, w10, w16
+ bn.addc w17, w11, w17
+ bn.sub w10, w16, w12
+ bn.subb w11, w17, w13
+ bn.sel w16, w16, w10, C
+ bn.sel w17, w17, w11, C
+ bn.mov w8, w16
+ bn.mov w9, w17
+
+ /* 10: [w26, w25] = X3 <= Y2+Z2 = dmem[x27+64]+dmem[x27+128] */
+ bn.lid x22, 64(x27)
+ bn.lid x23, 96(x27)
+ bn.lid x24, 128(x27)
+ bn.lid x25, 160(x27)
+ bn.add w16, w10, w16
+ bn.addc w17, w11, w17
+ bn.sub w10, w16, w12
+ bn.subb w11, w17, w13
+ bn.sel w16, w16, w10, C
+ bn.sel w17, w17, w11, C
+ bn.mov w25, w16
+ bn.mov w26, w17
+
+ /* 11: [w9, w8] = t4 <= t4*X3 = [w9, w8]*[w26, w25] */
+ bn.mov w10, w8
+ bn.mov w11, w9
+ bn.mov w16, w25
+ bn.mov w17, w26
+ jal x1, barrett384_p384
+ bn.mov w8, w16
+ bn.mov w9, w17
+
+ /* 12: [w26, w25] = X3 <= t1+t2 = [w3, w2]+[w5, w4] */
+ bn.add w16, w2, w4
+ bn.addc w17, w3, w5
+ bn.sub w10, w16, w12
+ bn.subb w11, w17, w13
+ bn.sel w16, w16, w10, C
+ bn.sel w17, w17, w11, C
+ bn.mov w25, w16
+ bn.mov w26, w17
+
+ /* 13: [w9, w8] = t4 <= t4-X3 = [w9, w8]-[w26, w25] */
+ bn.sub w16, w8, w25
+ bn.subb w17, w9, w26
+ bn.add w10, w16, w12
+ bn.addc w11, w17, w13
+ bn.sel w16, w10, w16, C
+ bn.sel w17, w11, w17, C
+ bn.mov w8, w16
+ bn.mov w9, w17
+
+ /* 14: [w26, w25] = X3 <= X1+Z1 = dmem[x26+0]+dmem[x26+128] */
+ bn.lid x22, 0(x26)
+ bn.lid x23, 32(x26)
+ bn.lid x24, 128(x26)
+ bn.lid x25, 160(x26)
+ bn.add w16, w10, w16
+ bn.addc w17, w11, w17
+ bn.sub w10, w16, w12
+ bn.subb w11, w17, w13
+ bn.sel w16, w16, w10, C
+ bn.sel w17, w17, w11, C
+ bn.mov w25, w16
+ bn.mov w26, w17
+
+ /* 15: [w28, w27] = Y3 <= X2+Z2 = dmem[x27+0]+dmem[x27+128] */
+ bn.lid x22, 0(x27)
+ bn.lid x23, 32(x27)
+ bn.lid x24, 128(x27)
+ bn.lid x25, 160(x27)
+ bn.add w16, w10, w16
+ bn.addc w17, w11, w17
+ bn.sub w10, w16, w12
+ bn.subb w11, w17, w13
+ bn.sel w16, w16, w10, C
+ bn.sel w17, w17, w11, C
+ bn.mov w27, w16
+ bn.mov w28, w17
+
+ /* 16: [w26, w25] = X3 <= X3*Y3 = [w26, w25]*[w28, w27] */
+ bn.mov w10, w25
+ bn.mov w11, w26
+ bn.mov w16, w27
+ bn.mov w17, w28
+ jal x1, barrett384_p384
+ bn.mov w25, w16
+ bn.mov w26, w17
+
+ /* 17: [w28, w27] = Y3 <= t0+t2 = [w1, w0]+[w5, w4] */
+ bn.add w16, w0, w4
+ bn.addc w17, w1, w5
+ bn.sub w10, w16, w12
+ bn.subb w11, w17, w13
+ bn.sel w16, w16, w10, C
+ bn.sel w17, w17, w11, C
+ bn.mov w27, w16
+ bn.mov w28, w17
+
+ /* 18: [w28, w27] = Y3 <= X3-Y3 = [w26, w25]-[w28, w27] */
+ bn.sub w16, w25, w27
+ bn.subb w17, w26, w28
+ bn.add w10, w16, w12
+ bn.addc w11, w17, w13
+ bn.sel w16, w10, w16, C
+ bn.sel w17, w11, w17, C
+ bn.mov w27, w16
+ bn.mov w28, w17
+
+ /* 19: [w30, w29] = Z3 <= b*t2 = dmem[x28+0]*[w5, w4] */
+ bn.lid x22, 0(x28)
+ bn.lid x23, 32(x28)
+ bn.mov w16, w4
+ bn.mov w17, w5
+ jal x1, barrett384_p384
+ bn.mov w29, w16
+ bn.mov w30, w17
+
+ /* 20: [w26, w25] = X3 <= Y3-Z3 = [w28, w27]-[w30, w29] */
+ bn.sub w16, w27, w29
+ bn.subb w17, w28, w30
+ bn.add w10, w16, w12
+ bn.addc w11, w17, w13
+ bn.sel w16, w10, w16, C
+ bn.sel w17, w11, w17, C
+ bn.mov w25, w16
+ bn.mov w26, w17
+
+ /* 21: [w30, w29] = Z3 <= X3+X3 = [w26, w25]+[w26, w25] */
+ bn.add w16, w25, w25
+ bn.addc w17, w26, w26
+ bn.sub w10, w16, w12
+ bn.subb w11, w17, w13
+ bn.sel w16, w16, w10, C
+ bn.sel w17, w17, w11, C
+ bn.mov w29, w16
+ bn.mov w30, w17
+
+ /* 22: [w26, w25] = X3 <= X3+Z3 = [w26, w25]+[w30, w29] */
+ bn.add w16, w25, w29
+ bn.addc w17, w26, w30
+ bn.sub w10, w16, w12
+ bn.subb w11, w17, w13
+ bn.sel w16, w16, w10, C
+ bn.sel w17, w17, w11, C
+ bn.mov w25, w16
+ bn.mov w26, w17
+
+ /* 23: [w30, w29] = Z3 <= t1-X3 = [w3, w2]-[w26, w25] */
+ bn.sub w16, w2, w25
+ bn.subb w17, w3, w26
+ bn.add w10, w16, w12
+ bn.addc w11, w17, w13
+ bn.sel w16, w10, w16, C
+ bn.sel w17, w11, w17, C
+ bn.mov w29, w16
+ bn.mov w30, w17
+
+ /* 24: [w26, w25] = X3 <= t1+X3 = [w3, w2]+[w26, w25] */
+ bn.add w16, w2, w25
+ bn.addc w17, w3, w26
+ bn.sub w10, w16, w12
+ bn.subb w11, w17, w13
+ bn.sel w16, w16, w10, C
+ bn.sel w17, w17, w11, C
+ bn.mov w25, w16
+ bn.mov w26, w17
+
+ /* 25: [w28, w27] = Y3 <= b*Y3 = dmem[x28+0]*[w28, w27] */
+ bn.lid x22, 0(x28)
+ bn.lid x23, 32(x28)
+ bn.mov w16, w27
+ bn.mov w17, w28
+ jal x1, barrett384_p384
+ bn.mov w27, w16
+ bn.mov w28, w17
+
+ /* 26: [w3, w2] = t1 <= t2+t2 = [w5, w4]+[w5, w4] */
+ bn.add w16, w4, w4
+ bn.addc w17, w5, w5
+ bn.sub w10, w16, w12
+ bn.subb w11, w17, w13
+ bn.sel w16, w16, w10, C
+ bn.sel w17, w17, w11, C
+ bn.mov w2, w16
+ bn.mov w3, w17
+
+ /* 27: [w5, w4] = t2 <= t1+t2 = [w3, w2]+[w5, w4] */
+ bn.add w16, w2, w4
+ bn.addc w17, w3, w5
+ bn.sub w10, w16, w12
+ bn.subb w11, w17, w13
+ bn.sel w16, w16, w10, C
+ bn.sel w17, w17, w11, C
+ bn.mov w4, w16
+ bn.mov w5, w17
+
+ /* 28: [w28, w27] = Y3 <= Y3-t2 = [w28, w27]-[w5, w4] */
+ bn.sub w16, w27, w4
+ bn.subb w17, w28, w5
+ bn.add w10, w16, w12
+ bn.addc w11, w17, w13
+ bn.sel w16, w10, w16, C
+ bn.sel w17, w11, w17, C
+ bn.mov w27, w16
+ bn.mov w28, w17
+
+ /* 29: [w28, w27] = Y3 <= Y3-t0 = [w28, w27]-[w1, w0] */
+ bn.sub w16, w27, w0
+ bn.subb w17, w28, w1
+ bn.add w10, w16, w12
+ bn.addc w11, w17, w13
+ bn.sel w16, w10, w16, C
+ bn.sel w17, w11, w17, C
+ bn.mov w27, w16
+ bn.mov w28, w17
+
+ /* 30: [w3, w2] = t1 <= Y3+Y3 = [w28, w27]+[w28, w27] */
+ bn.add w16, w27, w27
+ bn.addc w17, w28, w28
+ bn.sub w10, w16, w12
+ bn.subb w11, w17, w13
+ bn.sel w16, w16, w10, C
+ bn.sel w17, w17, w11, C
+ bn.mov w2, w16
+ bn.mov w3, w17
+
+ /* 31: [w28, w27] = Y3 <= t1+Y3 = [w3, w2]+[w28, w27] */
+ bn.add w16, w2, w27
+ bn.addc w17, w3, w28
+ bn.sub w10, w16, w12
+ bn.subb w11, w17, w13
+ bn.sel w16, w16, w10, C
+ bn.sel w17, w17, w11, C
+ bn.mov w27, w16
+ bn.mov w28, w17
+
+ /* 32: [w3, w2] = t1 <= t0+t0 = [w1, w0]+[w1, w0] */
+ bn.add w16, w0, w0
+ bn.addc w17, w1, w1
+ bn.sub w10, w16, w12
+ bn.subb w11, w17, w13
+ bn.sel w16, w16, w10, C
+ bn.sel w17, w17, w11, C
+ bn.mov w2, w16
+ bn.mov w3, w17
+
+ /* 33: [w1, w0] = t0 <= t1+t0 = [w3, w2]+[w1, w0] */
+ bn.add w16, w2, w0
+ bn.addc w17, w3, w1
+ bn.sub w10, w16, w12
+ bn.subb w11, w17, w13
+ bn.sel w16, w16, w10, C
+ bn.sel w17, w17, w11, C
+ bn.mov w0, w16
+ bn.mov w1, w17
+
+ /* 34: [w1, w0] = t0 <= t0-t2 = [w1, w0]-[w5, w4] */
+ bn.sub w16, w0, w4
+ bn.subb w17, w1, w5
+ bn.add w10, w16, w12
+ bn.addc w11, w17, w13
+ bn.sel w16, w10, w16, C
+ bn.sel w17, w11, w17, C
+ bn.mov w0, w16
+ bn.mov w1, w17
+
+ /* 35: [w3, w2] = t1 <= t4*Y3 = [w9, w8]*[w28, w27] */
+ bn.mov w10, w8
+ bn.mov w11, w9
+ bn.mov w16, w27
+ bn.mov w17, w28
+ jal x1, barrett384_p384
+ bn.mov w2, w16
+ bn.mov w3, w17
+
+ /* 36: [w5, w4] = t2 <= t0*Y3 = [w1, w0]*[w28, w27] */
+ bn.mov w10, w0
+ bn.mov w11, w1
+ bn.mov w16, w27
+ bn.mov w17, w28
+ jal x1, barrett384_p384
+ bn.mov w4, w16
+ bn.mov w5, w17
+
+ /* 37: [w28, w27] = Y3 <= X3*Z3 = [w26, w25]*[w30, w29] */
+ bn.mov w10, w25
+ bn.mov w11, w26
+ bn.mov w16, w29
+ bn.mov w17, w30
+ jal x1, barrett384_p384
+ bn.mov w27, w16
+ bn.mov w28, w17
+
+ /* 38: [w28, w27] = Y3 <= Y3+t2 = [w28, w27]+[w5, w4] */
+ bn.add w16, w27, w4
+ bn.addc w17, w28, w5
+ bn.sub w10, w16, w12
+ bn.subb w11, w17, w13
+ bn.sel w16, w16, w10, C
+ bn.sel w17, w17, w11, C
+ bn.mov w27, w16
+ bn.mov w28, w17
+
+ /* 39: [w26, w25] = X3 <= t3*X3 = [w7, w6]*[w26, w25] */
+ bn.mov w10, w6
+ bn.mov w11, w7
+ bn.mov w16, w25
+ bn.mov w17, w26
+ jal x1, barrett384_p384
+ bn.mov w25, w16
+ bn.mov w26, w17
+
+ /* 40: [w26, w25] = X3 <= X3-t1 = [w26, w25]-[w3, w2] */
+ bn.sub w16, w25, w2
+ bn.subb w17, w26, w3
+ bn.add w10, w16, w12
+ bn.addc w11, w17, w13
+ bn.sel w16, w10, w16, C
+ bn.sel w17, w11, w17, C
+ bn.mov w25, w16
+ bn.mov w26, w17
+
+ /* 41: [w30, w29] = Z3 <= t4*Z3 = [w9, w8]*[w30, w29] */
+ bn.mov w10, w8
+ bn.mov w11, w9
+ bn.mov w16, w29
+ bn.mov w17, w30
+ jal x1, barrett384_p384
+ bn.mov w29, w16
+ bn.mov w30, w17
+
+ /* 42: [w3, w2] = t1 <= t3*t0 = [w7, w6]*[w1, w0] */
+ bn.mov w10, w6
+ bn.mov w11, w7
+ bn.mov w16, w0
+ bn.mov w17, w1
+ jal x1, barrett384_p384
+ bn.mov w2, w16
+ bn.mov w3, w17
+
+ /* 43: [w30, w29] = Z3 <= Z3+t1 = [w30, w29]+[w3, w2] */
+ bn.add w16, w29, w2
+ bn.addc w17, w30, w3
+ bn.sub w10, w16, w12
+ bn.subb w11, w17, w13
+ bn.sel w16, w16, w10, C
+ bn.sel w17, w17, w11, C
+ bn.mov w29, w16
+ bn.mov w30, w17
+
+ ret
+
+
+.section .data
+
+/* P-384 domain parameter b */
+.globl p384_b
+p384_b:
+ .word 0xd3ec2aef
+ .word 0x2a85c8ed
+ .word 0x8a2ed19d
+ .word 0xc656398d
+ .word 0x5013875a
+ .word 0x0314088f
+ .word 0xfe814112
+ .word 0x181d9c6e
+ .word 0xe3f82d19
+ .word 0x988e056b
+ .word 0xe23ee7e4
+ .word 0xb3312fa7
+ .zero 16
+
+/* P-384 domain parameter p (modulus) */
+.globl p384_p
+p384_p:
+ .word 0xffffffff
+ .word 0x00000000
+ .word 0x00000000
+ .word 0xffffffff
+ .word 0xfffffffe
+ .word 0xffffffff
+ .word 0xffffffff
+ .word 0xffffffff
+ .word 0xffffffff
+ .word 0xffffffff
+ .word 0xffffffff
+ .word 0xffffffff
+ .zero 16
+
+/* barrett constant u for modulus p */
+.globl p384_u_p
+p384_u_p:
+ .word 0x00000001
+ .word 0xffffffff
+ .word 0xffffffff
+ .word 0x00000000
+ .word 0x00000001
+ .word 0x00000000
+ .word 0x00000000
+ .word 0x00000000
+ .word 0x00000000
+ .word 0x00000000
+ .word 0x00000000
+ .word 0x00000000
+ .zero 16
+
+/* P-384 domain parameter n (order of base point) */
+.globl p384_n
+p384_n:
+ .word 0xccc52973
+ .word 0xecec196a
+ .word 0x48b0a77a
+ .word 0x581a0db2
+ .word 0xf4372ddf
+ .word 0xc7634d81
+ .word 0xffffffff
+ .word 0xffffffff
+ .word 0xffffffff
+ .word 0xffffffff
+ .word 0xffffffff
+ .word 0xffffffff
+ .zero 16
+
+/* barrett constant u for n */
+.globl p384_u_n
+p384_u_n:
+ .word 0x333ad68d
+ .word 0x1313e695
+ .word 0xb74f5885
+ .word 0xa7e5f24d
+ .word 0x0bc8d220
+ .word 0x389cb27e
+ .word 0x00000000
+ .word 0x00000000
+ .word 0x00000000
+ .word 0x00000000
+ .word 0x00000000
+ .word 0x00000000
+ .zero 16
+
+/* P-384 basepoint G affine x-coordinate */
+.globl p384_gx
+p384_gx:
+ .word 0x72760ab7
+ .word 0x3a545e38
+ .word 0xbf55296c
+ .word 0x5502f25d
+ .word 0x82542a38
+ .word 0x59f741e0
+ .word 0x8ba79b98
+ .word 0x6e1d3b62
+ .word 0xf320ad74
+ .word 0x8eb1c71e
+ .word 0xbe8b0537
+ .word 0xaa87ca22
+ .zero 16
+
+/* P-384 basepoint G affine y-coordinate */
+.globl p384_gy
+p384_gy:
+ .word 0x90ea0e5f
+ .word 0x7a431d7c
+ .word 0x1d7e819d
+ .word 0x0a60b1ce
+ .word 0xb5f0b8c0
+ .word 0xe9da3113
+ .word 0x289a147c
+ .word 0xf8f41dbd
+ .word 0x9292dc29
+ .word 0x5d9e98bf
+ .word 0x96262c6f
+ .word 0x3617de4a
+ .zero 16
diff --git a/sw/otbn/code-snippets/p384_sign.s b/sw/otbn/code-snippets/p384_sign.s
new file mode 100644
index 0000000..e72f2d5
--- /dev/null
+++ b/sw/otbn/code-snippets/p384_sign.s
@@ -0,0 +1,1011 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+/*
+ * P-384 specific routines for ECDSA signature generation and constant-time
+ * scalar multiplication.
+ */
+
+ .section .text
+
+/**
+ * Convert projective coordinates of a P-384 curve point to affine coordinates
+ *
+ * returns P = (x_a, y_a) = (x/z mod p, y/z mod p)
+ * where P is a valid P-384 curve point,
+ * x_a and y_a are the resulting affine coordinates of the
+ * curve point,
+ * x,y and z are a set of projective coordinates of the
+ * point and
+ * p is the modulus of the P-384 underlying finite field.
+ *
+ * This routine computes the affine coordinates for a set of projective
+ * coordinates of a valid P-384 curve point. The routine performs the required
+ * divisions by computing the multiplicative modular inverse of the
+ * projective z-coordinate in the underlying finite field of the P-384 curve.
+ * For inverse computation Fermat's little theorem is used, i.e.
+ * we compute z^-1 = z^(p-2) mod p.
+ * For exponentiation a 16 step addition chain is used.
+ * Source of the addition chain is the addchain project:
+ * https://github.com/mmcloughlin/addchain/
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * @param[in] [w26,w25]: x, x-coordinate of curve point (projective).
+ * @param[in] [w28,w27]: y, y-coordinate of curve point (projective).
+ * @param[in] [w30,w29]: z, z-coordinate of curve point (projective).
+ * @param[in] [w13, w12]: p, modulus of P-384.
+ * @param[in] [w15, w14]: u, pre-computed Barrett constant for p,
+ * lower 384 bits, i.e. (2^(2*384) div p)[383:0].
+ * @param[in] w31: all-zero.
+ * @param[out] [w26, w25]: x_a, affine x-coordinate of resulting point.
+ * @param[out] [w28, w27]: y_a, affine y-coordinate of resulting point.
+ *
+ * clobbered registers: w0 to w28
+ * clobbered flag groups: FG0
+ */
+proj_to_affine_p384:
+
+ /* Exp: 0b10 = 2*0b1
+ Val: r10 = z^2 mod p
+ [w17,w16] <= [w30,w29]^2 mod [w13,w12] */
+ bn.mov w10, w29
+ bn.mov w11, w30
+ bn.mov w16, w29
+ bn.mov w17, w30
+ jal x1, barrett384_p384
+
+ /* Exp: 0b11 = 0b1+0b10
+ Val: r11 <= z*r10 mod p
+ [w17,w16] <= [w30,w29]*[w17,w16] mod [w13,w12] */
+ bn.mov w10, w29
+ bn.mov w11, w30
+ jal x1, barrett384_p384
+
+ /* Exp: 0b110 = 2*0b11
+ Val: r110 = r11^2 mod p
+ [w17,w16] <= [w17,w16]^2 mod [w13,w12] */
+ bn.mov w10, w16
+ bn.mov w11, w17
+ jal x1, barrett384_p384
+
+ /* Exp: 0b111 = 0b1+0b110
+ Val: r111 <= z*r110 mod p
+ [w1,w0] = [w17,w16] <= [w30,w29]*[w17,w16] mod [w13,w12] */
+ bn.mov w10, w29
+ bn.mov w11, w30
+ jal x1, barrett384_p384
+ bn.mov w0, w16
+ bn.mov w1, w17
+
+ /* Exp: 0b111000 = 0b111<<3
+ Val: r111000 <= r111^(2^3) mod p
+ [w17,w16] <= [w17,w16]^(2^3) mod [w13,w12] */
+ loopi 3, 4
+ bn.mov w10, w16
+ bn.mov w11, w17
+ jal x1, barrett384_p384
+ nop /* presumably padding so the loop body does not end on the jal - confirm */
+
+ /* Exp: 0b111111 = 0b111+0b111000
+ Val: r111111 <= r111*r111000 mod p
+ [w3,w2] = [w17,w16] <= [w1,w0]*[w17,w16] mod [w13,w12] */
+ bn.mov w10, w0
+ bn.mov w11, w1
+ jal x1, barrett384_p384
+ bn.mov w2, w16
+ bn.mov w3, w17
+
+ /* Exp: 2^12-1 = (0b111111<<6)+0b111111
+ Val: r_12_1 <= r111111^(2^6)*r111111 mod p
+ [w5,w4] = [w17,w16] <= [w17,w16]^(2^6)*[w3,w2] mod [w13,w12] */
+ loopi 6, 4
+ bn.mov w10, w16
+ bn.mov w11, w17
+ jal x1, barrett384_p384
+ nop
+ bn.mov w10, w2
+ bn.mov w11, w3
+ jal x1, barrett384_p384
+ bn.mov w4, w16
+ bn.mov w5, w17
+
+ /* Exp: 2^24-1 = ((2^12-1)<<12)+(2^12-1)
+ Val: r_24_1 <= r_12_1^(2^12)*r_12_1 mod p
+ [w17,w16] <= [w17,w16]^(2^12)*[w5,w4] mod [w13,w12] */
+ loopi 12, 4
+ bn.mov w10, w16
+ bn.mov w11, w17
+ jal x1, barrett384_p384
+ nop
+ bn.mov w10, w4
+ bn.mov w11, w5
+ jal x1, barrett384_p384
+
+ /* Exp: 2^30-1 = ((2^24-1)<<6)+0b111111
+ Val: r_30_1 <= r_24_1^(2^6)*r111111 mod p
+ [w3, w2] = [w17,w16] <= [w17,w16]^(2^6)*[w3,w2] mod [w13,w12] */
+ loopi 6, 4
+ bn.mov w10, w16
+ bn.mov w11, w17
+ jal x1, barrett384_p384
+ nop
+ bn.mov w10, w2
+ bn.mov w11, w3
+ jal x1, barrett384_p384
+ bn.mov w2, w16
+ bn.mov w3, w17
+
+ /* Exp: 2^31-1 <= (2^30-1)*2+0b1
+ Val: r_31_1 <= r_30_1^2*z mod p
+ [w7,w6] = [w17,w16] <= [w17,w16]^2*[w30,w29] mod [w13,w12] */
+ bn.mov w10, w16
+ bn.mov w11, w17
+ jal x1, barrett384_p384
+ bn.mov w10, w29
+ bn.mov w11, w30
+ jal x1, barrett384_p384
+ bn.mov w6, w16
+ bn.mov w7, w17
+
+ /* Exp: 2^32-1 <= (2^31-1)*2+0b1
+ Val: r_32_1 <= r_31_1^2*z mod p
+ [w9,w8] = [w17,w16] <= [w17,w16]^2*[w30,w29] mod [w13,w12] */
+ bn.mov w10, w16
+ bn.mov w11, w17
+ jal x1, barrett384_p384
+ bn.mov w10, w29
+ bn.mov w11, w30
+ jal x1, barrett384_p384
+ bn.mov w8, w16
+ bn.mov w9, w17
+
+ /* Exp: 2^63-1 <= ((2^32-1)<<31)+(2^31-1)
+ Val: r_63_1 <= r_32_1^(2^31)*r_31_1 mod p
+ [w7,w6] = [w17,w16] <= [w17,w16]^(2^31)*[w7,w6] mod [w13,w12] */
+ loopi 31, 4
+ bn.mov w10, w16
+ bn.mov w11, w17
+ jal x1, barrett384_p384
+ nop
+ bn.mov w10, w6
+ bn.mov w11, w7
+ jal x1, barrett384_p384
+ bn.mov w6, w16
+ bn.mov w7, w17
+
+ /* Exp: 2^126-1 = ((2^63-1)<<63) + (2^63-1)
+ Val: r_126_1 <= r_63_1^(2^63)*r_63_1 mod p
+ [w7,w6] = [w17,w16] <= [w17,w16]^(2^63)*[w7,w6] mod [w13,w12] */
+ loopi 63, 4
+ bn.mov w10, w16
+ bn.mov w11, w17
+ jal x1, barrett384_p384
+ nop
+ bn.mov w10, w6
+ bn.mov w11, w7
+ jal x1, barrett384_p384
+ bn.mov w6, w16
+ bn.mov w7, w17
+
+ /* Exp: 2^252-1 = ((2^126-1)<<126)+(2^126-1)
+ Val: r_252_1 <= r_126_1^(2^126)*r_126_1 mod p
+ [w17,w16] <= [w17,w16]^(2^126)*[w7,w6] mod [w13,w12] */
+ loopi 126, 4
+ bn.mov w10, w16
+ bn.mov w11, w17
+ jal x1, barrett384_p384
+ nop
+ bn.mov w10, w6
+ bn.mov w11, w7
+ jal x1, barrett384_p384
+
+ /* Exp: 2^255-1 = ((2^252-1)<<3)+0b111
+ Val: r_255_1 <= r_252_1^(2^3)*r111 mod p
+ [w17,w16] <= [w17,w16]^(2^3)*[w1,w0] mod [w13,w12] */
+ loopi 3, 4
+ bn.mov w10, w16
+ bn.mov w11, w17
+ jal x1, barrett384_p384
+ nop
+ bn.mov w10, w0
+ bn.mov w11, w1
+ jal x1, barrett384_p384
+
+ /* Exp: p-2 = ((((((2^255-1)<<33)+(2^32-1))<<94)+(2^30-1))<<2)+0b1
+ Val: z_inv <=((r_255_1^(2^33)*r_32_1)^(2^94)*r_30_1)^(2^2)*z mod p
+ [w17,w16] <= (([w17,w16]^(2^33)*[w9,w8])^(2^94)*[w3,w2])^(2^2)
+ *[w30,w29] mod [w13,w12] */
+ loopi 33, 4
+ bn.mov w10, w16
+ bn.mov w11, w17
+ jal x1, barrett384_p384
+ nop
+ bn.mov w10, w8
+ bn.mov w11, w9
+ jal x1, barrett384_p384
+ loopi 94, 4
+ bn.mov w10, w16
+ bn.mov w11, w17
+ jal x1, barrett384_p384
+ nop
+ bn.mov w10, w2
+ bn.mov w11, w3
+ jal x1, barrett384_p384
+ loopi 2, 4
+ bn.mov w10, w16
+ bn.mov w11, w17
+ jal x1, barrett384_p384
+ nop
+ bn.mov w10, w29
+ bn.mov w11, w30
+ jal x1, barrett384_p384
+
+ /* store inverse [w1,w0] <= [w17,w16] = z_inv */
+ bn.mov w0, w16
+ bn.mov w1, w17
+
+ /* convert x-coordinate to affine space
+ [w26,w25] <= [w17,w16] = x_a <= x/z = x*z_inv = [w26,w25]*[w1,w0] mod p */
+ bn.mov w10, w25
+ bn.mov w11, w26
+ jal x1, barrett384_p384
+ bn.mov w25, w16
+ bn.mov w26, w17
+
+ /* convert y-coordinate to affine space
+ [w28,w27] <= [w17,w16] = y_a <= y/z = y*z_inv = [w28,w27]*[w1,w0] mod p */
+ bn.mov w10, w27
+ bn.mov w11, w28
+ bn.mov w16, w0
+ bn.mov w17, w1
+ jal x1, barrett384_p384
+ bn.mov w27, w16
+ bn.mov w28, w17
+
+ ret
+
+
+/**
+ * Fetch curve point from dmem, randomize z-coordinate and store point in dmem
+ *
+ * returns P = (x, y, z) = (x_a*z, y_a*z, z)
+ * with P being a valid P-384 curve point in projective coordinates
+ * x_a and y_a being the affine coordinates as fetched from dmem
+ * z being a randomized z-coordinate
+ *
+ * This routine fetches the affine x- and y-coordinates of a curve point from
+ * dmem and computes a valid set of projective coordinates. The z-coordinate is
+ * randomized and x and y are scaled appropriately. The resulting projective
+ * coordinates are stored at dmem[dptr_p_p] using 6 consecutive 256-bit cells,
+ * i.e. each coordinate is stored 512 bit aligned, little endian.
+ * This routine runs in constant time.
+ *
+ * @param[in] x20: dptr_x, pointer to dmem location containing affine
+ * x-coordinate of input point
+ * @param[in] x21: dptr_y, pointer to dmem location containing affine
+ * y-coordinate of input point
+ * @param[in] [w15, w14]: u[383:0] lower 384 bit of Barrett constant u for
+ * modulus p
+ * @param[in] [w13, w12]: p, modulus of P-384 underlying finite field
+ * @param[in] w31: all-zero
+ * @param[in] x18: dptr_p_p, pointer to dmem location to store resulting point
+ * in projective space
+ *
+ * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
+ * the upper limb of projective y-coordinate.
+ *
+ * clobbered registers: x10 to x13,
+ * w2, w3, w8 to w11, w16 to w24, w29, w30
+ * clobbered flag groups: FG0
+ */
+store_proj_randomize:
+
+ /* get a 384-bit random number (upper 128 bits of w3 shifted down, rest 0)
+ [w3, w2] = random(384) */
+ bn.wsrr w2, 1
+ bn.wsrr w3, 1
+ bn.rshi w3, w31, w3 >> 128
+
+ /* reduce random number by one conditional subtraction of p
+ [w3, w2] = z <= [w3, w2] mod p */
+ bn.sub w10, w2, w12
+ bn.subb w11, w3, w13
+ bn.sel w2, w2, w10, C
+ bn.sel w3, w3, w11, C
+
+ bn.mov w10, w2
+ bn.mov w11, w3
+
+ /* store z-coordinate
+ dmem[x18+128] = dmem[dptr_p_p+128] = [w11, w10] */
+ li x10, 10
+ li x11, 11
+ bn.sid x10, 128(x18)
+ bn.sid x11, 160(x18)
+
+ /* fetch x-coordinate from dmem
+ [w17, w16] = x <= [dmem[dptr_x+32], dmem[dptr_x]] */
+ li x12, 16
+ li x13, 17
+ bn.lid x12, 0(x20)
+ bn.lid x13, 32(x20)
+
+ /* scale and store x-coordinate
+ [dmem[dptr_p_p+32], dmem[dptr_p_p]] = [w17, w16] =
+ x_p <= [w11, w10] * [w17, w16] = z*x mod p */
+
+ jal x1, barrett384_p384
+ bn.sid x12, 0(x18)
+ bn.sid x13, 32(x18)
+
+ /* fetch y-coordinate from dmem
+ [w17, w16] = y <= [dmem[dptr_y+32], dmem[dptr_y]] */
+ bn.lid x12, 0(x21)
+ bn.lid x13, 32(x21)
+
+ /* scale and store y-coordinate (z is restored to [w11, w10] from [w3, w2])
+ [dmem[dptr_p_p+96], dmem[dptr_p_p+64]] = [w17, w16] =
+ y_p <= [w11, w10] * [w17, w16] = z*y mod p */
+ bn.mov w10, w2
+ bn.mov w11, w3
+ jal x1, barrett384_p384
+ bn.sid x12, 64(x18)
+ bn.sid x13, 96(x18)
+
+ ret
+
+
+/**
+ * P-384 scalar point multiplication in affine space
+ *
+ * returns R = k*P = k*(x_p, y_p)
+ * where R, P are valid P-384 curve points in affine coordinates,
+ * k is a 384-bit scalar.
+ *
+ * This routine performs scalar multiplication based on the group laws
+ * of Weierstrass curves.
+ * A constant time double-and-add algorithm (sometimes referred to as
+ * double-and-add-always) is used.
+ * Due to the P-384 optimized implementations of the internally called routines
+ * for point addition and doubling, this routine is limited to P-384 curves.
+ * The routine makes use of blinding by additively splitting the
+ * exponent/scalar k into two shares. The double-and-add loop operates on both
+ * shares in parallel applying Shamir's trick.
+ *
+ * @param[in] x9: dptr_rnd, pointer to location in dmem containing random
+ * number to be used for additive splitting of scalar
+ * @param[in] x19: dptr_k, pointer to scalar k (0 < k < n) in dmem
+ * @param[in] x20: dptr_x, pointer to affine x-coordinate in dmem
+ * @param[in] x21: dptr_y, pointer to affine y-coordinate in dmem
+ * @param[in] x28: dptr_b, pointer to domain parameter b of P-384 in dmem
+ * @param[in] x30: dptr_sp, pointer to 704 bytes of scratchpad memory in dmem
+ * @param[in] [w15, w14]: u[383:0] lower 384 bit of Barrett constant u
+ * corresponding to modulus p
+ * @param[in] [w13, w12]: p, modulus of P-384 underlying finite field
+ * @param[in] [w11, w10]: n, domain parameter of P-384 curve
+ * (order of basepoint G)
+ * @param[in] w31: all-zero
+ * @param[out] [w26, w25]: x_a, affine x-coordinate of resulting point R.
+ * @param[out] [w28, w27]: y_a, affine y-coordinate of resulting point R.
+ *
+ * Scratchpad memory layout:
+ * The routine expects at least 704 bytes of scratchpad memory at dmem
+ * location 'scratchpad' (sp). Internally the scratchpad is used as follows:
+ * dptr_sp .. dptr_sp+191: point P, projective
+ * dptr_sp+192 .. dptr_sp+255: s0, 1st share of scalar
+ * dptr_sp+256 .. dptr_sp+447: point 2P, projective
+ * dptr_sp+448 .. dptr_sp+511: s1, 2nd share of scalar
+ * dptr_sp+512 .. dptr_sp+703: point Q, projective
+ *
+ * Projective coordinates of a point are kept in dmem in little endian format
+ * with the individual coordinates 512 bit aligned. The coordinates are stored
+ * in x,y,z order (i.e. x at lowest, z at highest address). Thus, a 384 bit
+ * curve point occupies 6 consecutive 256-bit dmem cells.
+ *
+ * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
+ * the computed affine y-coordinate.
+ *
+ * clobbered registers: x2, x3, x10 to x13, x18, x22 to x27, w0 to w30
+ * clobbered flag groups: FG0
+ */
+scalar_mult_int_p384:
+
+ /* set regfile pointers to in/out regs of Barrett routine. Set here to avoid
+ resetting in every call to point addition routine */
+ li x22, 10
+ li x23, 11
+ li x24, 16
+ li x25, 17
+
+ /* fetch externally supplied random number from dmem
+ [w1, w0] = dmem[dptr_rnd] = [dmem[x9+32], dmem[x9]] = rnd */
+ li x2, 0
+ bn.lid x2++, 0(x9)
+ bn.lid x2++, 32(x9)
+
+ /* 1st share (rnd reduced by one conditional subtraction of n)
+ s0 = [w1, w0] <= rnd mod n = [w1, w0] mod [w11, w10] */
+ bn.sub w9, w0, w10
+ bn.subb w8, w1, w11
+ bn.sel w0, w0, w9, C
+ bn.sel w1, w1, w8, C
+
+ /* load scalar k from dmem
+ [w3, w2] = k <= dmem[dptr_k] = [dmem[x19+32], dmem[x19]] */
+ bn.lid x2++, 0(x19)
+ bn.lid x2, 32(x19)
+
+ /* 2nd share (k-s0), n is added back if the subtraction borrowed
+ s1 = [w3, w2] <= k - s0 mod n = [w3, w2] - [w1, w0] mod [w11, w10] */
+ bn.sub w2, w2, w0
+ bn.subb w3, w3, w1
+ bn.add w8, w2, w10
+ bn.addc w9, w3, w11
+ bn.sel w2, w8, w2, C
+ bn.sel w3, w9, w3, C
+
+ /* left align both shares (shift left by 128) for probing of MSB in loop */
+ bn.rshi w1, w1, w0 >> 128
+ bn.rshi w0, w0, w31 >> 128
+ bn.rshi w3, w3, w2 >> 128
+ bn.rshi w2, w2, w31 >> 128
+
+ /* store shares in scratchpad: s0 at dptr_sp+192, s1 at dptr_sp+448 */
+ li x2, 0
+ bn.sid x2++, 192(x30)
+ bn.sid x2++, 224(x30)
+ bn.sid x2++, 448(x30)
+ bn.sid x2++, 480(x30)
+
+ /* get randomized projective coordinates of curve point
+ P = (x_p, y_p, z_p) = dmem[dptr_sp] = (x*z mod p, y*z mod p, z) */
+ add x18, x30, 0
+ jal x1, store_proj_randomize
+
+ /* double point P (both operand pointers x26 and x27 refer to P)
+ 2P = (x, y, z) = ([w26,w25], [w28,w27], [w30,w29]) <= 2*P */
+ add x27, x30, x0
+ add x26, x30, x0
+ jal x1, proj_add_p384
+
+ /* store point 2P in scratchpad @x30+256
+ dmem[dptr_sp+256] = [w30:w25] = 2P */
+ li x2, 25
+ bn.sid x2++, 256(x30)
+ bn.sid x2++, 288(x30)
+ bn.sid x2++, 320(x30)
+ bn.sid x2++, 352(x30)
+ bn.sid x2++, 384(x30)
+ bn.sid x2++, 416(x30)
+
+ /* init point Q = (0,1,0) for double-and-add in scratchpad */
+ /* dmem[x26] = dmem[dptr_sp+512] = Q = (0,1,0); w30 = 1, then w31 = 0 */
+ addi x26, x30, 512
+ li x2, 30
+ bn.addi w30, w31, 1
+ bn.sid x2++, 64(x26)
+ bn.sid x2, 0(x26)
+ bn.sid x2, 32(x26)
+ bn.sid x2, 96(x26)
+ bn.sid x2, 128(x26)
+ bn.sid x2, 160(x26)
+
+ /* double-and-add loop with decreasing index (85-instruction loop body) */
+ loopi 384, 85
+
+ /* double point Q (x27 is set to x26, so both operands are Q)
+ Q = (x, y, z) = ([w26,w25], [w28,w27], [w30,w29]) <= Q + dmem[x27] */
+ add x27, x26, x0
+ jal x1, proj_add_p384
+
+ /* store Q in dmem
+ dmem[x26] = dmem[dptr_sp+512] <= [w30:w25] */
+ li x2, 25
+ bn.sid x2++, 0(x26)
+ bn.sid x2++, 32(x26)
+ bn.sid x2++, 64(x26)
+ bn.sid x2++, 96(x26)
+ bn.sid x2++, 128(x26)
+ bn.sid x2++, 160(x26)
+
+ /* Probe if MSb of either of the two shares (s0 or s1) but not both
+ is 1.
+ If only one MSb is set, select P for addition.
+ If both MSbs are set, select 2P for addition.
+ (If neither MSB is set, 2P will be selected but result discarded.) */
+ li x2, 0
+ bn.lid x2++, 224(x30)
+ bn.lid x2, 480(x30)
+ bn.xor w8, w0, w1
+ /* Create conditional offset into scratchpad from inverted bit 1 of 0x7c0.
+ if (s0[511] xor s1[511]) x27 <= x30 else x27 <= x30+256 */
+ csrrs x3, 0x7c0, x0
+ xori x3, x3, -1
+ andi x3, x3, 2
+ slli x27, x3, 7
+ add x27, x27, x30
+
+ /* Reload randomized projective coordinates for curve point P.
+ P = (x_p, y_p, z_p) = dmem[dptr_sp] <= (x*z mod p, y*z mod p, z) */
+ jal x1, store_proj_randomize
+
+ /* Add points Q+P or Q+2P depending on offset in x27.
+ Q_a = (x, y, z) = ([w26,w25], [w28,w27], [w30,w29]) <= Q + dmem[x27] */
+ jal x1, proj_add_p384
+
+ /* load shares from scratchpad
+ [w1, w0] = s0; [w3, w2] = s1 */
+ li x2, 0
+ bn.lid x2++, 192(x30)
+ bn.lid x2++, 224(x30)
+ bn.lid x2++, 448(x30)
+ bn.lid x2++, 480(x30)
+
+ /* M = s0[511] | s1[511] */
+ bn.or w8, w1, w3
+
+ /* load Q from scratchpad
+ Q = (x, y, z) = ([w5,w4], [w7,w6], [w9,w8]) <= dmem[x26] */
+ li x2, 4
+ bn.lid x2++, 0(x26)
+ bn.lid x2++, 32(x26)
+ bn.lid x2++, 64(x26)
+ bn.lid x2++, 96(x26)
+ bn.lid x2++, 128(x26)
+ bn.lid x2++, 160(x26)
+
+ /* select either Q or Q_a
+ if M: [w30:w25] <= Q_a (keep sum) else: [w30:w25] <= Q (discard sum) */
+ bn.sel w25, w25, w4, M
+ bn.sel w26, w26, w5, M
+ bn.sel w27, w27, w6, M
+ bn.sel w28, w28, w7, M
+ bn.sel w29, w29, w8, M
+ bn.sel w30, w30, w9, M
+
+ /* store Q in dmem
+ dmem[x26] = dmem[dptr_sp+512] <= [w30:w25] */
+ li x2, 25
+ bn.sid x2++, 0(x26)
+ bn.sid x2++, 32(x26)
+ bn.sid x2++, 64(x26)
+ bn.sid x2++, 96(x26)
+ bn.sid x2++, 128(x26)
+ bn.sid x2++, 160(x26)
+
+ /* left shift both shares
+ s0 <= s0 << 1 ; s1 <= s1 << 1 */
+ bn.add w0, w0, w0
+ bn.addc w1, w1, w1
+ bn.add w2, w2, w2
+ bn.addc w3, w3, w3
+ /* store both shares in scratchpad */
+ li x2, 0
+ bn.sid x2++, 192(x30)
+ bn.sid x2++, 224(x30)
+ bn.sid x2++, 448(x30)
+ bn.sid x2++, 480(x30)
+
+
+ /* Get a fresh random number and scale the coordinates of 2P.
+ (scaling each proj. coordinate by same factor results in same point) */
+
+ /* get a 384-bit random number (overwrites s1, which was stored above) */
+ bn.wsrr w2, 1
+ bn.wsrr w3, 1
+ bn.rshi w3, w31, w3 >> 128
+
+ /* reduce random number
+ [w3, w2] = z <= [w3, w2] mod p */
+ bn.sub w10, w2, w12
+ bn.subb w11, w3, w13
+ bn.sel w2, w2, w10, C
+ bn.sel w3, w3, w11, C
+
+ /* scale all coordinates of 2P in scratchpad (x2/x3 point to w16/w17) */
+ li x2, 16
+ li x3, 17
+ /* x-coordinate */
+ bn.mov w10, w2
+ bn.mov w11, w3
+ bn.lid x2, 256(x30)
+ bn.lid x3, 288(x30)
+ jal x1, barrett384_p384
+ bn.sid x2, 256(x30)
+ bn.sid x3, 288(x30)
+ /* y-coordinate */
+ bn.mov w10, w2
+ bn.mov w11, w3
+ bn.lid x2, 320(x30)
+ bn.lid x3, 352(x30)
+ jal x1, barrett384_p384
+ bn.sid x2, 320(x30)
+ bn.sid x3, 352(x30)
+ /* z-coordinate (last instruction below ends the loop body) */
+ bn.mov w10, w2
+ bn.mov w11, w3
+ bn.lid x2, 384(x30)
+ bn.lid x3, 416(x30)
+ jal x1, barrett384_p384
+ bn.sid x2, 384(x30)
+ bn.sid x3, 416(x30)
+
+ /* convert coordinates of Q in [w30:w25] to affine space */
+ jal x1, proj_to_affine_p384
+
+ ret
+
+
+/**
+ * Externally callable wrapper for P-384 scalar point multiplication
+ *
+ * returns R = k*P = k*(x_p, y_p)
+ * where R, P are valid P-384 curve points in affine coordinates,
+ * k is a 384-bit scalar.
+ *
+ * Sets up context and calls internal scalar multiplication routine.
+ * This routine runs in constant time.
+ *
+ * @param[in] dmem[0]: dptr_k, pointer to location in dmem containing scalar k
+ * @param[in] dmem[4]: dptr_rnd, pointer to location in dmem containing random
+ * number for blinding
+ * @param[in] dmem[20]: dptr_x, pointer to affine x-coordinate in dmem
+ * @param[in] dmem[24]: dptr_y, pointer to affine y-coordinate (offset presumed; confirm)
+ *
+ * 384-bit quantities have to be provided in dmem in little-endian format,
+ * 512 bit aligned, with the highest 128 bit set to zero.
+ *
+ * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
+ * the computed affine y-coordinate.
+ *
+ * clobbered registers: x2, x3, x9 to x13, x18 to x30
+ * w0 to w30
+ * clobbered flag groups: FG0
+ */
+.globl scalar_mult_p384
+scalar_mult_p384:
+
+ /* set dmem pointer to point x-coordinate (x20 <= dmem[dptr_x]) */
+ la x20, dptr_x
+ lw x20, 0(x20)
+
+ /* set dmem pointer to point y-coordinate (x21 <= dmem[dptr_y]) */
+ la x21, dptr_y
+ lw x21, 0(x21)
+
+ /* set dmem pointer to scalar k (x19 <= dmem[dptr_k]) */
+ la x19, dptr_k
+ lw x19, 0(x19)
+
+ /* set pointer to blinding parameter (x9 <= dmem[dptr_rnd]) */
+ la x9, dptr_rnd
+ lw x9, 0(x9)
+
+ /* set dmem pointer to domain parameter b */
+ la x28, p384_b
+
+ /* set dmem pointer to scratchpad */
+ la x30, scratchpad
+
+ /* load domain parameter p (modulus)
+ [w13, w12] = p = dmem[p384_p] */
+ li x2, 12
+ la x3, p384_p
+ bn.lid x2++, 0(x3)
+ bn.lid x2++, 32(x3)
+
+ /* load Barrett constant u for modulus p
+ [w15, w14] = u_p = dmem[p384_u_p] */
+ li x2, 14
+ la x3, p384_u_p
+ bn.lid x2++, 0(x3)
+ bn.lid x2++, 32(x3)
+
+ /* load domain parameter n (order of base point)
+ [w11, w10] = n = dmem[p384_n] */
+ li x2, 10
+ la x3, p384_n
+ bn.lid x2++, 0(x3)
+ bn.lid x2++, 32(x3)
+
+ /* init all-zero reg */
+ bn.xor w31, w31, w31
+
+ jal x1, scalar_mult_int_p384
+
+ ret
+
+
+/**
+ * Variable-time modular multiplicative inverse computation
+ *
+ * returns x_inv = x^-1 mod m
+ *
+ * This routine computes the modular multiplicative inverse for any x < m in
+ * the finite field GF(m) where m is prime.
+ *
+ * For inverse computation, Fermat's little theorem is used, i.e.
+ * we compute x^-1 = x^(m-2) mod m.
+ * For exponentiation we use a standard, variable-time (!) square and multiply
+ * algorithm.
+ *
+ * This routine is mainly intended to be used for inversion of scalars in
+ * context of the P-384 curve. In theory, it can be used with any 384-bit
+ * modulus m with a corresponding 385-bit Barrett constant u,
+ * where u[383:192] = 0.
+ *
+ * Note: When used for P-384 scalar inversion, the routine will need 672 calls
+ * to the multiplication routine. By using an adder chain this could be reduced
+ * to ~433 multiplications, however, at the cost of a significant code size
+ * increase.
+ *
+ * Note: This routine runs in variable-time w.r.t. the modulus. It should only
+ * be used with a non-secret modulus.
+ *
+ * @param[in] [w13, w12]: m, 384 bit modulus
+ * @param[in] [w15, w14]: u[383:0], lower 384 bit of pre-computed Barrett
+ * constant corresponding to modulus m
+ * @param[in] [w30, w29]: x, 384 bit operand
+ * @param[in] w31, all-zero
+ * @param[out] [w17, w16]: x_inv, modular multiplicative inverse
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * clobbered registers: x2, w2, w3, w10, w11, w16 to w24
+ * clobbered flag groups: FG0
+ */
+mod_inv_n_p384:
+
+ /* subtract 2 from modulus for Fermat's little theorem
+ [w3,w2] <= m - 2 = [w13,w12]-2 (left aligned, i.e. shifted left by 128) */
+ bn.subi w2, w12, 2
+ bn.subb w3, w13, w31
+ bn.rshi w3, w3, w2 >> 128
+ bn.rshi w2, w2, w31 >> 128
+
+ /* init square and multiply: [w17,w16] = 1 */
+ bn.addi w16, w31, 1
+ bn.mov w17, w31
+
+ /* square and multiply loop (one iteration per exponent bit, 12 instr.) */
+ loopi 384, 12
+
+ /* square: [w17,w16] <= [w17, w16]*[w11,w10] = [w17,w16]^2 mod [w13, w12] */
+ bn.mov w10, w16
+ bn.mov w11, w17
+ jal x1, barrett384_p384
+
+ /* shift MSB of exponent into carry flag
+ [w3,w2] = 2*[w3,w2] = [w3,w2] << 1 */
+ bn.add w2, w2, w2
+ bn.addc w3, w3, w3
+
+ /* skip multiplication if C flag (bit 0 of CSR 1984 = 0x7c0) not set */
+ csrrs x2, 1984, x0
+ andi x2, x2, 1
+ beq x2, x0, nomul
+
+ /* multiply: [w17,w16] <= [w17, w16]*[w30,w29] mod [w13, w12] */
+ bn.mov w10, w29
+ bn.mov w11, w30
+ jal x1, barrett384_p384
+
+ nomul:
+ nop
+
+ ret
+
+
+/**
+ * P-384 ECDSA signature generation
+ *
+ * returns the signature as the pair r, s with
+ * r = x_1 mod n
+ * and s = k^(-1)(msg + r*d) mod n
+ * where x_1 is the affine x-coordinate of the curve point k*G,
+ * G is the curve's base point,
+ * k is a supplied secret random number,
+ * n is the order of the base point G of P-384,
+ * msg is the message to be signed, and
+ * d is the private key.
+ *
+ * This routine runs in constant time.
+ *
+ * @param[in] dmem[0]: dptr_k, pointer to a 384 bit random secret in dmem
+ * @param[in] dmem[4]: dptr_rnd, pointer to location in dmem containing
+ * a 384-bit random number for blinding
+ * @param[in] dmem[8]: dptr_msg, pointer to the message to be signed in dmem
+ * @param[in] dmem[12]: dptr_r, pointer to dmem location where r component
+ * of signature will be placed
+ * @param[in] dmem[16]: dptr_s, pointer to dmem location where s component
+ * of signature will be placed
+ * @param[in] dmem[28]: dptr_d, pointer to private key d in dmem
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * clobbered registers: x2, x3, x9 to x13, x18 to x28, x30
+ * w0 to w31
+ * clobbered flag groups: FG0
+ */
+.globl p384_sign
+p384_sign:
+ /* init all-zero reg */
+ bn.xor w31, w31, w31
+
+ /* set dmem pointer to domain parameter b */
+ la x28, p384_b
+
+ /* set dmem pointer to basepoint x-coordinate */
+ la x20, p384_gx
+
+ /* set dmem pointer to basepoint y-coordinate */
+ la x21, p384_gy
+
+ /* set dmem pointer to secret random scalar k (dereference dptr_k) */
+ la x19, dptr_k
+ lw x19, 0(x19)
+
+ /* set pointer to blinding parameter (dereference dptr_rnd) */
+ la x9, dptr_rnd
+ lw x9, 0(x9)
+
+ /* set dmem pointer to scratchpad */
+ la x30, scratchpad
+
+ /* load domain parameter p (modulus)
+ [w13, w12] <= p = dmem[p384_p] */
+ li x2, 12
+ la x3, p384_p
+ bn.lid x2++, 0(x3)
+ bn.lid x2++, 32(x3)
+
+ /* load Barrett constant u for modulus p
+ [w15, w14] = u_p = dmem[p384_u_p] */
+ li x2, 14
+ la x3, p384_u_p
+ bn.lid x2++, 0(x3)
+ bn.lid x2++, 32(x3)
+
+ /* load domain parameter n (order of base point)
+ [w11, w10] = n = dmem[p384_n] */
+ li x2, 10
+ la x3, p384_n
+ bn.lid x2++, 0(x3)
+ bn.lid x2++, 32(x3)
+
+ /* scalar multiplication with base point
+ [w28:w25] <= (x_1, y_1) = k*G */
+ jal x1, scalar_mult_int_p384
+
+ /* store r: dmem[dptr_r] <= [w26,w25]; NOTE(review): no mod-n step seen here */
+ li x2, 25
+ la x3, dptr_r
+ lw x3, 0(x3)
+ bn.sid x2++, 0(x3)
+ bn.sid x2++, 32(x3)
+
+ /* load secret random number k from dmem
+ [w30,w29] <= k = dmem[dptr_k] */
+ li x2, 29
+ bn.lid x2++, 0(x19)
+ bn.lid x2++, 32(x19)
+
+ /* load domain parameter n (order of base point) as modulus for scalar ops
+ [w13, w12] <= n = dmem[p384_n] */
+ li x2, 12
+ la x3, p384_n
+ bn.lid x2++, 0(x3)
+ bn.lid x2++, 32(x3)
+
+ /* load Barrett constant u_n for modulus n for scalar operations
+ [w15, w14] <= u_n = dmem[p384_u_n] */
+ li x2, 14
+ la x3, p384_u_n
+ bn.lid x2++, 0(x3)
+ bn.lid x2++, 32(x3)
+
+ /* modular multiplicative inverse of k
+ [w3, w2] <= [w17, w16] <= k^(-1) mod n */
+ jal x1, mod_inv_n_p384
+ bn.mov w2, w16
+ bn.mov w3, w17
+
+ /* load private key d from dmem
+ [w11,w10] <= d = dmem[dptr_d] */
+ li x2, 10
+ la x3, dptr_d
+ lw x3, 0(x3)
+ bn.lid x2++, 0(x3)
+ bn.lid x2++, 32(x3)
+
+ /* [w17, w16] <= k^(-1)*d mod n = [w17, w16] * [w11, w10] mod [w13, w12] */
+ jal x1, barrett384_p384
+
+ /* [w5, w4] <= [w17, w16]
+ <= r * (k^(-1)*d) mod n = [w26, w25] * [w17, w16] mod [w13, w12] */
+ bn.mov w10, w25
+ bn.mov w11, w26
+ jal x1, barrett384_p384
+ bn.mov w4, w16
+ bn.mov w5, w17
+
+ /* load message from dmem
+ [w11, w10] <= msg = dmem[dptr_msg] */
+ li x2, 10
+ la x3, dptr_msg
+ lw x3, 0(x3)
+ bn.lid x2++, 0(x3)
+ bn.lid x2++, 32(x3)
+
+ /* [w17, w16] <= k^(-1) * msg = [w3, w2]*[w11, w10] mod n */
+ bn.mov w16, w2
+ bn.mov w17, w3
+ jal x1, barrett384_p384
+
+ /* [w28, w27] <= s' = k^(-1)*msg + k^(-1)*r*d = [w17, w16] + [w5, w4]*/
+ bn.add w27, w16, w4
+ bn.addc w28, w17, w5
+
+ /* reduce s: [w28, w27] <= s <= s' mod n = [w28, w27] mod [w13, w12] */
+ bn.sub w10, w27, w12
+ bn.subb w11, w28, w13
+ bn.sel w27, w27, w10, C
+ bn.sel w28, w28, w11, C
+
+ /* store s of signature in dmem: dmem[dptr_s] <= s = [w28, w27] */
+ li x2, 27
+ la x3, dptr_s
+ lw x3, 0(x3)
+ bn.sid x2++, 0(x3)
+ bn.sid x2++, 32(x3)
+
+ ret
+
+
+/* pointers (one 4-byte word each, read with lw) and scratchpad memory */
+.section .data
+
+/* pointer to secret scalar k (dptr_k) */
+.globl dptr_k
+dptr_k:
+ .zero 4
+
+/* pointer to random number used for blinding (dptr_rnd) */
+.globl dptr_rnd
+dptr_rnd:
+ .zero 4
+
+/* pointer to message to be signed (dptr_msg) */
+.globl dptr_msg
+dptr_msg:
+ .zero 4
+
+/* pointer to r component of signature (dptr_r) */
+.globl dptr_r
+dptr_r:
+ .zero 4
+
+/* pointer to s component of signature (dptr_s) */
+.globl dptr_s
+dptr_s:
+ .zero 4
+
+/* pointer to X (dptr_x) */
+.globl dptr_x
+dptr_x:
+ .zero 4
+
+/* pointer to Y (dptr_y) */
+.globl dptr_y
+dptr_y:
+ .zero 4
+
+/* pointer to private key d (dptr_d) */
+.globl dptr_d
+dptr_d:
+ .zero 4
+
+/* 704 bytes of scratchpad memory (p384_sign passes its address in x30) */
+scratchpad:
+ .zero 704
diff --git a/sw/otbn/code-snippets/rules.mk b/sw/otbn/code-snippets/rules.mk
index 280584c..646461f 100644
--- a/sw/otbn/code-snippets/rules.mk
+++ b/sw/otbn/code-snippets/rules.mk
@@ -151,18 +151,26 @@
# p384_proj_add_test depends on p384_proj_add defined in p384.s
$(otbn-code-snippets-bin-dir)/p384_proj_add_test.elf: \
- $(otbn-code-snippets-obj-dir)/p384.o
+ $(otbn-code-snippets-obj-dir)/p384_base.o
$(otbn-code-snippets-bin-dir)/p384_proj_add_test.elf: \
- otbn-libs += $(otbn-code-snippets-obj-dir)/p384.o
+ otbn-libs += $(otbn-code-snippets-obj-dir)/p384_base.o
-# p384_scalar_mult_test depends on p384_scalar_mult_int defined in p384.s
-$(otbn-code-snippets-bin-dir)/p384_scalar_mult_test.elf: \
- $(otbn-code-snippets-obj-dir)/p384.o
-$(otbn-code-snippets-bin-dir)/p384_scalar_mult_test.elf: \
- otbn-libs += $(otbn-code-snippets-obj-dir)/p384.o
+# code in p384_sign depends on code defined in p384_base.s
+$(otbn-code-snippets-bin-dir)/p384_sign.elf: \
+ $(otbn-code-snippets-obj-dir)/p384_base.o
+$(otbn-code-snippets-bin-dir)/p384_sign.elf: \
+ otbn-libs += $(otbn-code-snippets-obj-dir)/p384_base.o
-# p384_ecdsa_sign_test depends on p384_sign defined in p384.s
+# p384_scalar_mult_test depends on scalar_mult_p384 defined in p384_sign.s
+$(otbn-code-snippets-bin-dir)/p384_scalar_mult_test.elf: \
+ $(otbn-code-snippets-obj-dir)/p384_sign.o
+$(otbn-code-snippets-bin-dir)/p384_scalar_mult_test.elf: \
+ otbn-libs += $(otbn-code-snippets-obj-dir)/p384_sign.o \
+ $(otbn-code-snippets-obj-dir)/p384_base.o
+
+# p384_ecdsa_sign_test depends on p384_sign defined in p384_sign.s
$(otbn-code-snippets-bin-dir)/p384_ecdsa_sign_test.elf: \
- $(otbn-code-snippets-obj-dir)/p384.o
+ $(otbn-code-snippets-obj-dir)/p384_sign.o
$(otbn-code-snippets-bin-dir)/p384_ecdsa_sign_test.elf: \
- otbn-libs += $(otbn-code-snippets-obj-dir)/p384.o
+ otbn-libs += $(otbn-code-snippets-obj-dir)/p384_sign.o \
+ $(otbn-code-snippets-obj-dir)/p384_base.o